CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_4_4_5_patch3/src/DataFormats/Math/interface/SSEVec.h

Go to the documentation of this file.
00001 #ifndef DataFormat_Math_SSEVec_H
00002 #define DataFormat_Math_SSEVec_H
00003 
00004 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
00005 #include <x86intrin.h>
00006 #define CMS_USE_SSE
00007 #ifdef __AVX__
00008 #define CMS_USE_AVX
00009 #endif
00010 #else
00011 
00012 #ifdef __SSE2__
00013 #define CMS_USE_SSE
00014 
00015 #include <mmintrin.h>
00016 #include <emmintrin.h>
00017 #endif
00018 #ifdef __SSE3__
00019 #include <pmmintrin.h>
00020 #endif
00021 #ifdef __SSE4_1__
00022 #include <smmintrin.h>
00023 #endif
00024 
00025 #endif
00026 
#include <cmath>
#include <cstring>
00028 
// needed for gcc < 4.6
namespace mathSSE {
  // RAII scope guard issuing vzeroupper on construction and destruction.
  // Clears the upper 128 bits of the YMM registers to avoid the costly
  // AVX<->legacy-SSE transition state; compiles to nothing without AVX.
  struct ZeroUpper {
    ZeroUpper() {
#ifdef __AVX__
    _mm256_zeroupper();
#endif
    }
   ~ZeroUpper() {
#ifdef __AVX__
    _mm256_zeroupper();
#endif
    }
  };
}
00044 
namespace mathSSE {
  /// Scalar square root, forwarding to std::sqrt.
  /// Specialized for the SIMD vector types further down in this file.
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
}
00048 
namespace mathSSE {
  // samesign(rh, lh): true when the two values carry the same sign bit.
  // Note this compares the raw sign bit, so samesign(-0.0, 0.0) is false.
  template<typename T> inline bool samesign(T rh, T lh);

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
    int const mask= 0x80000000;
    // sign bits equal  <=>  xor clears the sign bit
    return ((rh^lh)&mask) == 0;
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
    long long const mask= 0x8000000000000000LL;
    return ((rh^lh)&mask) == 0;
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
    // Bit-copy via memcpy instead of reading the inactive member of a
    // union: union type punning is undefined behavior in ISO C++ (gcc
    // defines it, but memcpy compiles to the same code and is portable).
    int a, b;
    std::memcpy(&a, &rh, sizeof(a));
    std::memcpy(&b, &lh, sizeof(b));
    return samesign<int>(a, b);
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
    long long a, b;
    std::memcpy(&a, &rh, sizeof(a));
    std::memcpy(&b, &lh, sizeof(b));
    return samesign<long long>(a, b);
  }
}
00083 
00084 
00085 namespace mathSSE {
00086 #ifdef  CMS_USE_SSE
  // dot: horizontal dot product of all four float lanes of v1 and v2.
  // The sum is replicated into every lane of the returned register.
  inline __m128 
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    // single instruction: multiply all lanes, sum, broadcast (mask 0xff)
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    // two horizontal adds reduce the four products into every lane
    mul = _mm_hadd_ps(mul,mul);
    return _mm_hadd_ps(mul,mul);
#else
    // SSE2 fallback: shuffle+add pairs, then shuffle+add again
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
00106   
00107 
  // cross (just 3x3): 3D cross product of the x,y,z lanes of v1 and v2.
  // The w lane evaluates to w1*w2 - w2*w1, i.e. zero for finite inputs.
  inline __m128 
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm_cross_ps(__m128 v1, __m128 v2) {
    // same order is  _MM_SHUFFLE(3,2,1,0)
    //                                               x2, z1,z1
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    //                                               y1, x2,y2
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
    
    __m128 v5 = _mm_mul_ps(v3, v4);
    
    //                                         x1, z2,z2
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    //                                        y2, x1,y1
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
    
    v3 = _mm_mul_ps(v3, v4);
    // flip the sign of the y lane only (-0.0 in that slot) to fix up the
    // sign pattern coming out of the shuffled difference
    const  __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
    return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
  }
00129 #endif // CMS_USE_SSE
00130 
00131 #ifdef  CMS_USE_AVX
  // Horizontal dot product of all four double lanes of v1 and v2.
  // The sum is replicated into every lane of the returned register.
  inline __m256d  
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    // hadd works within each 128-bit half; the permute swaps the halves
    // so the final add combines them
    mul = _mm256_hadd_pd(mul,mul);
    __m256d tmp = _mm256_permute2f128_pd(mul,mul,1);
    return _mm256_add_pd(mul,tmp);
  }
00140 
  // 3D cross product of the x,y,z lanes of v1 and v2 (AVX double version).
  // Only the x,y,z lanes of the result are meaningful.
  inline __m256d  
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm256_cross_pd(__m256d v1, __m256d v2) {
    
    // build the cross-term operands with half-swapping permutes, then use
    // the same multiply / subtract / y-sign-flip scheme as _mm_cross_ps
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2<<4)+1);
    v3 = _mm256_permute_pd(v3,0);
    
    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2<<4));
    v4 = _mm256_permute_pd(v4,5);
    
    __m256d v5 = _mm256_mul_pd(v3, v4);
    
    v3 = _mm256_permute2f128_pd(v1, v2, (2<<4)+1);
    v3 = _mm256_permute_pd(v3,0);
    
    v4 = _mm256_permute2f128_pd(v2, v1, (2<<4));
    v4 = _mm256_permute_pd(v4,5);
    
    v3 = _mm256_mul_pd(v3, v4);
    // flip the sign of the y lane only (-0.0 in that slot)
    const  __m256d neg = _mm256_set_pd(0.0,0.0,-0.0,0.0);
    return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);
 
  }
00164 
00165 #endif //  CMS_USE_AVX
00166 
00167 
00168 
  // Legacy POD layout (x,y,z,w), 16-byte aligned so it can live in a
  // union alongside the SIMD register representations below.
  template<typename T>
  struct OldVec { T  theX; T  theY; T  theZ; T  theW;}  __attribute__ ((aligned (16)));
#ifdef  CMS_USE_AVX
  // With AVX, Vec4<double> is one 256-bit register: match 32-byte alignment.
  template<>
  struct OldVec<double> { double  theX; double  theY; double  theZ; double  theW;}  __attribute__ ((aligned (32)));
#endif
00175   
00176   template<typename T> union Vec4;
00177 
00178   template<typename T> union Vec2{
00179     Vec2() {
00180       arr[0] = 0; arr[1] = 0;
00181     }
00182     Vec2(T f1, T f2) {
00183       arr[0] = f1; arr[1] = f2;
00184     }
00185     explicit Vec2(T f1) {
00186       arr[0] = f1; arr[1] = f1;
00187     }
00188 
00189     void set(T f1, T f2) {
00190       arr[0] = f1; arr[1] = f2;
00191     }
00192 
00193     template <int N>
00194     Vec2 get1() const { 
00195       return Vec2(arr[N],arr[N]);
00196     }
00197 
00198     /*
00199     Vec2 get1(unsigned int n) const {
00200       return Vec2(arr[n],arr[n]);
00201     }
00202     */
00203 
00204     template<typename U> 
00205     Vec2(Vec2<U> v) {
00206       arr[0] = v[0]; arr[1] = v[1];
00207 
00208     }
00209 
00210     inline Vec2(Vec4<T> v4);
00211 
00212     T & operator[](unsigned int n) {
00213       return arr[n];
00214     }
00215 
00216     T operator[](unsigned int n) const {
00217       return arr[n];
00218     }
00219 
00220 
00221     T arr[2];
00222   };
00223 
00224 
00225   template<typename T> union Vec4{
00226     Vec4() {
00227       arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
00228     }
00229     Vec4(float f1, float f2, float f3, float f4=0) {
00230       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00231     }
00232     explicit Vec4(float f1) {
00233       set1(f1);
00234     }
00235     void set(float f1, float f2, float f3, float f4=0) {
00236       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00237     }
00238     void set1(float f1) {
00239       arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
00240     }
00241    template <int N>
00242     Vec4 get1() const { 
00243      return Vec4(arr[N],arr[N],arr[N],arr[N]);
00244    }
00245     /*
00246     Vec4 get1(unsigned int n) const {
00247       return Vec4(arr[n],arr[n],arr[n],arr[n]);
00248     }
00249     */
00250 
00251     Vec2<T> xy() const { return  Vec2<T>(arr[0],arr[1]);}
00252     Vec2<T> zw() const { return  Vec2<T>(arr[2],arr[3]);}
00253 
00254 
00255 
00256     T __attribute__ ((aligned(16))) arr[4];
00257     OldVec<T> o;
00258   };
00259 
00260   template<typename T>
00261   inline Vec2<T>::Vec2(Vec4<T> v4) {
00262     arr[0]=v4[0];arr[1]=v4[1];
00263   }
00264 
00265 
00266 #ifdef CMS_USE_SSE
00267 
  // The SSE specializations of Vec2/Vec4 are defined below.
  template<>
  union Vec2<double>;
  template<>
  union Vec4<double>;

  // Bit-mask companions of Vec2/Vec4: a SIMD register aliased with an
  // integer array so per-lane masks can be filled element by element.
  template<typename T> union Mask2{};
  template<typename T> union Mask4{};

  template<>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__ ((aligned(16))) mask[4];
    // all-zero mask
    Mask4() {vec = _mm_setzero_ps();}
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
    }
  };
00285   
#ifdef  CMS_USE_AVX
  // With AVX a 4-double mask fits in one 256-bit register.
  template<>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__ ((aligned(32))) mask[4];
    // all-zero mask
    Mask4() {
      vec = _mm256_setzero_pd();
     }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
    }
  };
#else
  // Without AVX the 4-double mask is split over two 128-bit registers.
  template<>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__ ((aligned(16))) mask[4];
    // all-zero mask
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
    }
  };
#endif
00312 
  // Two-lane double mask in one 128-bit register.
  template<>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__ ((aligned(16))) mask[2];
    // all-zero mask
    Mask2() {
      vec = _mm_setzero_pd();
    }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0]=m1;  mask[1]=m2; 
    }
  };
00324 
  // SSE specialization: four floats in a single 128-bit register,
  // aliased with a plain array and the legacy OldVec layout.
  template<>
  union Vec4<float> {
    typedef  __m128 nativeType;
    __m128 vec;
    float __attribute__ ((aligned(16))) arr[4];
    OldVec<float> o;
    
    // wrap a raw SSE register
    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const & ivec) : o(ivec) {}
    
    // zero-initialized
    Vec4() {
      vec = _mm_setzero_ps();
    }


    // narrowing conversion from double precision; defined further below
    inline Vec4(Vec4<double> ivec);

    // broadcast a single value to all lanes
    explicit Vec4(float f1) {
      set1(f1);
    }

    Vec4(float f1, float f2, float f3, float f4=0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
    }


    // concatenate two 2-vectors into (x0, y0, x1, y1)
    Vec4( Vec2<float> ivec0,   Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
      arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
    }
    
    Vec4( Vec2<float> ivec0,  float f3, float f4=0) {
      arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
      arr[2] = f3; arr[3] = f4;
    }

   // promote a 2-vector; z and w are zeroed first
   Vec4( Vec2<float> ivec0) {
     vec = _mm_setzero_ps();
     arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
    }


    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec; 
    }

    void set(float f1, float f2, float f3, float f4=0) {
      // note the reversed argument order of _mm_set_ps (highest lane first)
      vec = _mm_set_ps(f4, f3, f2, f1);
    }

    void set1(float f1) {
     vec =  _mm_set1_ps(f1);
    }

    // broadcast lane N (compile-time index) to all lanes
    template <int N>
    Vec4 get1() const { 
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N)); 
    }

    float & operator[](unsigned int n) {
      return arr[n];
    }

    float operator[](unsigned int n) const {
      return arr[n];
    }
    
    Vec2<float> xy() const { return  Vec2<float>(arr[0],arr[1]);}
    Vec2<float> zw() const { return  Vec2<float>(arr[2],arr[3]);}

  };
00398   
00399 
  // SSE specialization: two doubles in a single 128-bit register.
  template<>
  union Vec2<double> {
    typedef  __m128d nativeType;
    __m128d vec;
    double __attribute__ ((aligned(16))) arr[2];
        
    // wrap a raw SSE register
    Vec2(__m128d ivec) : vec(ivec) {}
    
    // zero-initialized
    Vec2() {
      vec = _mm_setzero_pd();
    }


    // widening conversion from single precision; defined further below
    inline Vec2(Vec2<float> ivec);


    Vec2(double f1, double f2) {
      arr[0] = f1; arr[1] = f2;
    }

    // broadcast a single value to both lanes
    explicit Vec2(double f1) {
      set1(f1);
    }
    
    // truncating conversion from a 4-vector (keeps x,y); defined further below
    inline Vec2(Vec4<double> v4); 

   // for masking
   void setMask(unsigned long long m1, unsigned long long m2) {
     Mask2<double> mask(m1,m2); vec=mask.vec; 
   }


    void set(double f1, double f2) {
      arr[0] = f1; arr[1] = f2;
    }

    void set1(double f1) {
      vec = _mm_set1_pd(f1);
    }

    // broadcast component N (compile-time index)
    template <int N>
    Vec2 get1() const { 
      return Vec2(arr[N],arr[N]);
    }
    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */
    // NOTE(review): unlike Vec4<float>, only a const operator[] is
    // provided, so elements cannot be assigned through [] here.
    double operator[](unsigned int n) const {
      return arr[n];
    }
  };
00453  
00454 
00455 
00456 
00457 #ifdef  CMS_USE_AVX
00458 }// namespace mathSSE
00459 #include "AVXVec.h"
00460 
00461 namespace mathSSE {
00462 #else
  // SSE (non-AVX) specialization: four doubles as two 128-bit registers,
  // vec[0] = (x,y) and vec[1] = (z,w).
  template<>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__ ((aligned(16))) arr[4];
    OldVec<double> o;
    
    // wrap a pair of raw SSE registers
    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }
    
    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }
    
    // zero-initialized
    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    // widening conversion: convert the low float pair, shuffle the high
    // pair down, convert it too
    Vec4( Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    // broadcast a single value to all four lanes
    explicit Vec4(double f1) {
      set1(f1);
    }

    Vec4(double f1, double f2, double f3, double f4=0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
    }
    
   Vec4( Vec2<double> ivec0,   Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }
    
    Vec4( Vec2<double> ivec0,  double f3, double f4=0) {
      vec[0] = ivec0.vec;
      arr[2] = f3; arr[3] = f4;
    }

   // promote a 2-vector; z and w are zeroed
   Vec4( Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] =  _mm_setzero_pd();
    }


    Vec4(OldVec<double> const & ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1]; 
    }


    void set(double f1, double f2, double f3, double f4=0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
    }

    // broadcast one value to all four lanes
    void set1(double f1) {
      vec[0] = vec[1]= _mm_set1_pd(f1);
    }


    // broadcast component N (compile-time index)
    template<int N>
    Vec4 get1() const {
      return Vec4(arr[N],arr[N],arr[N],arr[N]);
    }

    double & operator[](unsigned int n) {
      return arr[n];
    }

    double operator[](unsigned int n) const {
      return arr[n];
    }
  
    Vec2<double> xy() const { return vec[0];}
    Vec2<double> zw() const { return vec[1];}

  };
00547 
00548   
  // Narrowing conversion Vec4<double> -> Vec4<float>: convert each
  // 128-bit double pair to two floats, then pack both pairs into one
  // register (shuffle keeps lanes 0,1 of each source).
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
00554 #endif  // CMS_USE_AVX
00555 
00556 
00557 #endif // CMS_USE_SSE
00558   
  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  // 3D vectors reuse the 4-component storage (the w lane is ignored)
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  // Lightweight view tagging a Vec4 as three-dimensional, e.g. for the
  // stream operators declared at the end of this file. Holds a reference:
  // the viewed vector must outlive the As3D object.
  template<typename T>
  struct As3D {
    Vec4<T> const & v;
    As3D(Vec4<T> const &iv ) : v(iv){}
  };

  // convenience factory: wrap a Vec4 in an As3D view
  template<typename T>
  inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
00574 
00575 } // namespace mathSSE
00576 
00577 #ifdef CMS_USE_SSE
00578 
00579 
00580 //float op
00581 
00582 
// True only if all four float lanes compare equal.
inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
}

// Per-lane compares: each result lane is all-ones (true) or all-zeros.
inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpeq_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpgt_ps(a.vec,b.vec);
}

#ifdef __SSE3__
// Horizontal add of adjacent lane pairs: (a0+a1, a2+a3, b0+b1, b2+b3).
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_hadd_ps(a.vec,b.vec);
}
#endif


// Unary minus: flip the sign bit of every lane.
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
  return _mm_xor_ps(a.vec,neg);
}

// Bitwise operations on the raw lane bits (mainly for use with the
// compare/mask results above).
inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_and_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_or_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_xor_ps(a.vec,b.vec);
}
// (~a) & b, following the andnot instruction's operand order.
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_andnot_ps(a.vec,b.vec);
}
00619 
00620 
// Lane-wise arithmetic.
inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_add_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_sub_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_mul_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return  _mm_div_ps(a.vec,b.vec);
}

// Scalar * vector (both orders): broadcast the scalar, multiply lane-wise.
inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
  return  _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
  return  _mm_mul_ps(_mm_set1_ps(a),b.vec);
}


// Full 4-lane dot product, extracted to a scalar from lane 0.
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using  mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
  return s;
}

// 3D cross product (see _mm_cross_ps above; w lane not meaningful).
inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using  mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec,b.vec);
}
00657 
00658 
// Dot product of the x,y lanes only: a0*b0 + a1*b1 (z,w ignored).
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a*b;
#ifdef __SSE3__
   // lane 0 of hadd is mul0 + mul1
   mul = hadd(mul,mul);
#else
   // swap lanes within each pair, then add: lane 0 becomes mul0 + mul1
   __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
   mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s,mul.vec);
  return s;
}
00671 
00672 
00673 //
00674 // float op 2d (use the 4d one...)
00675 //
// Vec2F operators promote to Vec4F (the Vec4F(Vec2F) constructor zeroes
// the upper two lanes) and reuse the 4-lane implementations above.
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
  return -mathSSE::Vec4F(a);
}


inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return  mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return  mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return  mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
}

// NOTE(review): the zeroed upper lanes give 0/0 = NaN in lanes 2,3 of
// the result; harmless as long as only the x,y lanes are consumed.
inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return  mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
}


inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
  return  s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator*(float s,mathSSE::Vec2F a) {
  return  s*mathSSE::Vec4F(a);
}


inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b)  __attribute__((always_inline)) __attribute__ ((pure));

// 2D dot product, computed in plain scalar code.
inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
  return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
}

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b)  __attribute__((always_inline)) __attribute__ ((pure));

// 2D cross product (z component of the 3D cross), plain scalar code.
inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
}
00718 
00719 
00721 // double op 2d
00722 //
00723 
// Truncate a 4-vector to its x,y lanes.
inline  mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef  CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

// Widen a float 2-vector element by element.
inline  mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
}

// Unary minus: flip the sign bit of both lanes.
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
  return _mm_xor_pd(a.vec,neg);
}


// Bitwise operations on the raw lane bits (mainly for use with masks).
inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_and_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_or_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_xor_pd(a.vec,b.vec);
}
// (~a) & b, following the andnot instruction's operand order.
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_andnot_pd(a.vec,b.vec);
}

#ifdef __SSE3__
// Horizontal add: (a0+a1, b0+b1).
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_hadd_pd(a.vec,b.vec);
}
#endif

// Lane-wise arithmetic.
inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_add_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_sub_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_mul_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return  _mm_div_pd(a.vec,b.vec);
}

// Scalar * vector (both orders): broadcast the scalar, multiply lane-wise.
inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
  return  _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
  return  _mm_mul_pd(_mm_set1_pd(a),b.vec);
}
00784 
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b)  __attribute__((always_inline)) __attribute__ ((pure));

// 2D dot product: a0*b0 + a1*b1, reduced inside the SSE register.
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
  __m128d res = _mm_mul_pd ( a.vec, b.vec);
#ifdef __SSE3__
    res = _mm_hadd_pd(res,res);
#else
  // add the swapped pair into lane 0
  res = _mm_add_sd (  _mm_shuffle_pd ( res , res, 1 ), res );
#endif
  double s;
  _mm_store_sd(&s,res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b)  __attribute__((always_inline)) __attribute__ ((pure));

// 2D cross product: a0*b1 - a1*b0.
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  // swap b's lanes, multiply, then subtract the swapped product in lane 0
  __m128d res =  _mm_shuffle_pd ( b.vec, b.vec, 1);
  res = _mm_mul_pd (  a.vec , res );
  res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
  double s;
  _mm_store_sd(&s,res);
  return s;
}
00809 
00810 
00811 #ifndef  CMS_USE_AVX
00812 // double op 3d
00813 
00814 
#ifdef __SSE3__
// consistent with AVX...
// Half-wise horizontal add: result is (a0+a1, b0+b1, a2+a3, b2+b3).
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return  mathSSE::Vec4D(hadd(a.vec[0],b.vec[0]),hadd(a.vec[1],b.vec[1]) );
}
#endif


// True only if all four double lanes compare equal.
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return 
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 && 
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
}

// Unary minus: flip the sign bit of every lane.
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
}


// Lane-wise arithmetic, applied to both 128-bit halves.
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return  mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
}


inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return  mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return  mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return  mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
}

// Scalar * vector (both orders): broadcast the scalar, multiply lane-wise.
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return  mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return  mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}
00859 
00860 // mix algebra (creates ambiguities...)
00861 /*
00862 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
00863   return a + mathSSE::Vec4D(b);
00864 }
00865 inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
00866   return a + mathSSE::Vec4D(b);
00867 }
00868 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
00869   return a - mathSSE::Vec4D(b);
00870 }
00871 inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
00872   return a - mathSSE::Vec4D(b);
00873 }
00874 */
00875 
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

// 3D dot product: a.x*b.x + a.y*b.y + a.z*b.z (the w lanes are ignored).
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  // lane 0: x*x' + z*z' (add_sd/mul_sd touch only lane 0); lane 1: y*y'
  __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
                             _mm_mul_sd ( a.vec[1], b.vec[1]) 
                             );
  // fold lane 1 (y*y') into lane 0
  res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
  double s;
  _mm_store_sd(&s,res);
  return s;
}
00887 
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
 
// 3D cross product of the x,y,z lanes. Only lanes 0-2 of the result are
// meaningful; the w lane holds a leftover product (a.y*b.x).
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
  // lh .z * rh .x , lh .z * rh .y
  __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
  // rh .z * lh .x , rh .z * lh .y
  __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd (  b.vec[1],  b.vec[1] ),  a.vec[0] );
  __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
  m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
  m1 = _mm_xor_pd ( m1 , neg ); // change the sign of the first element
  // lh .x * rh .y , lh .y * rh .x
  l1 = _mm_mul_pd (  a.vec[0] , _mm_shuffle_pd (  b.vec[0] ,  b.vec[0] , 1 ) );
  // lh .x * rh .y - lh .y * rh .x
  __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );

  return  mathSSE::Vec4D( m1 , m2 );
}
00906 
00907 
// Dot product of the x,y lanes only, via the 2D dot above.
inline double  
__attribute__((always_inline)) __attribute__ ((pure)) 
dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(),b.xy());
}
00913 
00914 #endif   // CMS_USE_AVX
00915 
00916 
// sqrt: lane-wise specializations of the scalar mathSSE::sqrt template.
namespace mathSSE {
  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
  // Vec2F reuses the 4-lane version (upper lanes are zeroed on promotion).
  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
#ifdef  CMS_USE_AVX
  template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec);}
#else
  // non-AVX: apply sqrt to each 128-bit half
  template<> inline Vec4D sqrt(Vec4D v) { 
    return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
  }
#endif
}
00930 
// Cephes-style vectorized math functions (implemented in sse_mathfun.h)
00932 #include "DataFormats/Math/interface/sse_mathfun.h"
00933 namespace mathSSE {
00934   inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
00935   inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
00936   inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
00937   inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
00938   inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
00939 
00940   inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00941   inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
00942   inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
00943   inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00944   inline void sincos(float f, float & s, float & c) { 
00945     __m128 vs, vc; 
00946     sincos_ps(_mm_load_ss(&f),&vs, &vc);   
00947     _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);   
00948   }
00949 }
00950 #endif // CMS_USE_SSE
00951 
00952 
00953 #include <iosfwd>
00954 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
00955 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
00956 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
00957 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
00958 
00959 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
00960 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
00961 
00962 
00963 #endif // DataFormat_Math_SSEVec_H