00001 #ifndef DataFormat_Math_SSEVec_H
00002 #define DataFormat_Math_SSEVec_H
00003
00004 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
00005 #include <x86intrin.h>
00006 #define CMS_USE_SSE
00007
00008 #else
00009
00010 #ifdef __SSE2__
00011 #define CMS_USE_SSE
00012
00013 #include <mmintrin.h>
00014 #include <emmintrin.h>
00015 #endif
00016 #ifdef __SSE3__
00017 #include <pmmintrin.h>
00018 #endif
00019 #ifdef __SSE4_1__
00020 #include <smmintrin.h>
00021 #endif
00022
00023 #endif
00024
00025 #include<cmath>
00026
00027 namespace mathSSE {
00028 template<typename T> inline T sqrt(T t) { return std::sqrt(t);}
00029 }
00030
00031 namespace mathSSE {
00032
00033 template<typename T> inline bool samesign(T rh, T lh);
00034
00035 template<>
00036 inline bool
00037 __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
00038 int const mask= 0x80000000;
00039 return ((rh^lh)&mask) == 0;
00040 }
00041
00042 template<>
00043 inline bool
00044 __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
00045 long long const mask= 0x8000000000000000LL;
00046 return ((rh^lh)&mask) == 0;
00047 }
00048
00049 template<>
00050 inline bool
00051 __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
00052 union { int i; float f; } a, b;
00053 a.f=rh; b.f=lh;
00054 return samesign<int>(a.i,b.i);
00055 }
00056
00057 template<>
00058 inline bool
00059 __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
00060 union { long long i; double f; } a, b;
00061 a.f=rh; b.f=lh;
00062 return samesign<long long>(a.i,b.i);
00063 }
00064 }
00065
00066
00067 namespace mathSSE {
00068 #ifdef CMS_USE_SSE
00069
00070 inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
00071 #ifdef __SSE4_1__
00072 return _mm_dp_ps(v1, v2, 0xff);
00073 #else
00074 __m128 mul = _mm_mul_ps(v1, v2);
00075 #ifdef __SSE3__
00076 mul = _mm_hadd_ps(mul,mul);
00077 return _mm_hadd_ps(mul,mul);
00078 #else
00079 __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
00080 mul = _mm_add_ps(mul, swp);
00081 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
00082 return _mm_add_ps(mul, swp);
00083 #endif
00084 #endif
00085 }
00086
00087
00088
00089 inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
00090 __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
00091 __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
00092
00093 __m128 v5 = _mm_mul_ps(v3, v4);
00094
00095 v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
00096 v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
00097
00098 v3 = _mm_mul_ps(v3, v4);
00099 const __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
00100 return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
00101 }
00102
00103
00104 #endif // CMS_USE_SSE
00105
00106
00107 template<typename T>
00108 struct OldVec { T theX; T theY; T theZ; T theW;} __attribute__ ((aligned (16)));
00109
00110 template<typename T> union Vec4;
00111
00112 template<typename T> union Vec2{
00113 Vec2() {
00114 arr[0] = 0; arr[1] = 0;
00115 }
00116 Vec2(T f1, T f2) {
00117 arr[0] = f1; arr[1] = f2;
00118 }
00119 explicit Vec2(T f1) {
00120 arr[0] = f1; arr[1] = f1;
00121 }
00122
00123 void set(T f1, T f2) {
00124 arr[0] = f1; arr[1] = f2;
00125 }
00126 Vec2 get1(unsigned int n) const {
00127 return Vec2(arr[n],arr[n]);
00128 }
00129
00130 template<typename U>
00131 Vec2(Vec2<U> v) {
00132 arr[0] = v[0]; arr[1] = v[1];
00133
00134 }
00135
00136 inline Vec2(Vec4<T> v4);
00137
00138 T & operator[](unsigned int n) {
00139 return arr[n];
00140 }
00141
00142 T operator[](unsigned int n) const {
00143 return arr[n];
00144 }
00145
00146
00147 T arr[2];
00148 };
00149
00150
00151 template<typename T> union Vec4{
00152 Vec4() {
00153 arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
00154 }
00155 Vec4(float f1, float f2, float f3, float f4=0) {
00156 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00157 }
00158 explicit Vec4(float f1) {
00159 set1(f1);
00160 }
00161 void set(float f1, float f2, float f3, float f4=0) {
00162 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00163 }
00164 void set1(float f1) {
00165 arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
00166 }
00167 Vec4 get1(unsigned int n) const {
00168 return Vec4(arr[n],arr[n],arr[n],arr[n]);
00169 }
00170
00171 Vec2<T> xy() const { return Vec2<T>(arr[0],arr[1]);}
00172 Vec2<T> zw() const { return Vec2<T>(arr[2],arr[3]);}
00173
00174
00175
00176 T __attribute__ ((aligned(16))) arr[4];
00177 OldVec<T> o;
00178 };
00179
00180 template<typename T>
00181 inline Vec2<T>::Vec2(Vec4<T> v4) {
00182 arr[0]=v4[0];arr[1]=v4[1];
00183 }
00184
00185
00186 #ifdef CMS_USE_SSE
00187
00188 template<>
00189 union Vec2<double>;
00190 template<>
00191 union Vec4<double>;
00192
00193 template<typename T> union Mask2{};
00194 template<typename T> union Mask4{};
00195
00196 template<>
00197 union Mask4<float> {
00198 __m128 vec;
00199 unsigned int __attribute__ ((aligned(16))) mask[4];
00200 Mask4() {vec = _mm_setzero_ps();}
00201 Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
00202 mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
00203 }
00204 };
00205
00206 template<>
00207 union Mask4<double> {
00208 __m128d vec[2];
00209 unsigned long long __attribute__ ((aligned(16))) mask[4];
00210 Mask4() {
00211 vec[0] = _mm_setzero_pd();
00212 vec[1] = _mm_setzero_pd();
00213 }
00214 Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00215 mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
00216 }
00217 };
00218
00219
00220 template<>
00221 union Mask2<double> {
00222 __m128d vec;
00223 unsigned long long __attribute__ ((aligned(16))) mask[2];
00224 Mask2() {
00225 vec = _mm_setzero_pd();
00226 }
00227 Mask2(unsigned long long m1, unsigned long long m2) {
00228 mask[0]=m1; mask[1]=m2;
00229 }
00230 };
00231
00232 template<>
00233 union Vec4<float> {
00234 typedef __m128 nativeType;
00235 __m128 vec;
00236 float __attribute__ ((aligned(16))) arr[4];
00237 OldVec<float> o;
00238
00239 Vec4(__m128 ivec) : vec(ivec) {}
00240
00241 Vec4(OldVec<float> const & ivec) : o(ivec) {}
00242
00243 Vec4() {
00244 vec = _mm_setzero_ps();
00245 }
00246
00247
00248 inline Vec4(Vec4<double> ivec);
00249
00250 explicit Vec4(float f1) {
00251 set1(f1);
00252 }
00253
00254 Vec4(float f1, float f2, float f3, float f4=0) {
00255 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00256 }
00257
00258
00259 Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
00260 arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00261 arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
00262 }
00263
00264 Vec4( Vec2<float> ivec0, float f3, float f4=0) {
00265 arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00266 arr[2] = f3; arr[3] = f4;
00267 }
00268
00269 Vec4( Vec2<float> ivec0) {
00270 vec = _mm_setzero_ps();
00271 arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00272 }
00273
00274
00275
00276 void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
00277 Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec;
00278 }
00279
00280 void set(float f1, float f2, float f3, float f4=0) {
00281 vec = _mm_set_ps(f4, f3, f2, f1);
00282 }
00283
00284 void set1(float f1) {
00285 vec = _mm_set1_ps(f1);
00286 }
00287
00288 Vec4 get1(unsigned int n) const {
00289 return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(n, n, n, n));
00290 }
00291
00292 float & operator[](unsigned int n) {
00293 return arr[n];
00294 }
00295
00296 float operator[](unsigned int n) const {
00297 return arr[n];
00298 }
00299
00300 Vec2<float> xy() const { return Vec2<float>(arr[0],arr[1]);}
00301 Vec2<float> zw() const { return Vec2<float>(arr[2],arr[3]);}
00302
00303 };
00304
00305
00306 template<>
00307 union Vec2<double> {
00308 typedef __m128d nativeType;
00309 __m128d vec;
00310 double __attribute__ ((aligned(16))) arr[2];
00311
00312 Vec2(__m128d ivec) : vec(ivec) {}
00313
00314 Vec2() {
00315 vec = _mm_setzero_pd();
00316 }
00317
00318
00319 inline Vec2(Vec2<float> ivec);
00320
00321
00322 Vec2(double f1, double f2) {
00323 arr[0] = f1; arr[1] = f2;
00324 }
00325
00326 explicit Vec2(double f1) {
00327 set1(f1);
00328 }
00329
00330 inline Vec2(Vec4<double> v4);
00331
00332
00333 void setMask(unsigned long long m1, unsigned long long m2) {
00334 Mask2<double> mask(m1,m2); vec=mask.vec;
00335 }
00336
00337
00338 void set(double f1, double f2) {
00339 arr[0] = f1; arr[1] = f2;
00340 }
00341
00342 void set1(double f1) {
00343 vec = _mm_set1_pd(f1);
00344 }
00345
00346 Vec2 get1(unsigned int n) const {
00347 return Vec2(arr[n],arr[n]);
00348 }
00349
00350 double operator[](unsigned int n) const {
00351 return arr[n];
00352 }
00353 };
00354
00355
00356 template<>
00357 union Vec4<double> {
00358 __m128d vec[2];
00359 double __attribute__ ((aligned(16))) arr[4];
00360 OldVec<double> o;
00361
00362 Vec4(__m128d ivec[]) {
00363 vec[0] = ivec[0];
00364 vec[1] = ivec[1];
00365 }
00366
00367 Vec4(__m128d ivec0, __m128d ivec1) {
00368 vec[0] = ivec0;
00369 vec[1] = ivec1;
00370 }
00371
00372 Vec4() {
00373 vec[0] = _mm_setzero_pd();
00374 vec[1] = _mm_setzero_pd();
00375 }
00376
00377 Vec4( Vec4<float> ivec) {
00378 vec[0] = _mm_cvtps_pd(ivec.vec);
00379 vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
00380 }
00381
00382 explicit Vec4(double f1) {
00383 set1(f1);
00384 }
00385
00386 Vec4(double f1, double f2, double f3, double f4=0) {
00387 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00388 }
00389
00390 Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
00391 vec[0] = ivec0.vec;
00392 vec[1] = ivec1.vec;
00393 }
00394
00395 Vec4( Vec2<double> ivec0, double f3, double f4=0) {
00396 vec[0] = ivec0.vec;
00397 arr[2] = f3; arr[3] = f4;
00398 }
00399
00400 Vec4( Vec2<double> ivec0) {
00401 vec[0] = ivec0.vec;
00402 vec[1] = _mm_setzero_pd();
00403 }
00404
00405
00406 Vec4(OldVec<double> const & ivec) : o(ivec) {}
00407
00408
00409 void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00410 Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1];
00411 }
00412
00413
00414 void set(double f1, double f2, double f3, double f4=0) {
00415 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00416 }
00417
00418 void set1(double f1) {
00419 vec[0] = vec[1]= _mm_set1_pd(f1);
00420 }
00421
00422
00423 Vec4 get1(unsigned int n) const {
00424 return Vec4(arr[n],arr[n],arr[n],arr[n]);
00425 }
00426
00427 double & operator[](unsigned int n) {
00428 return arr[n];
00429 }
00430
00431 double operator[](unsigned int n) const {
00432 return arr[n];
00433 }
00434
00435 Vec2<double> xy() const { return vec[0];}
00436 Vec2<double> zw() const { return vec[1];}
00437
00438 };
00439
00440
00441 inline Vec4<float>::Vec4(Vec4<double> ivec) {
00442 vec = _mm_cvtpd_ps(ivec.vec[0]);
00443 __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
00444 vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
00445 }
00446
00447
00448 #endif // CMS_USE_SSE
00449
00450 typedef Vec2<float> Vec2F;
00451 typedef Vec4<float> Vec4F;
00452 typedef Vec4<float> Vec3F;
00453 typedef Vec2<double> Vec2D;
00454 typedef Vec4<double> Vec3D;
00455 typedef Vec4<double> Vec4D;
00456
00457 template<typename T>
00458 struct As3D {
00459 Vec4<T> const & v;
00460 As3D(Vec4<T> const &iv ) : v(iv){}
00461 };
00462
00463 template<typename T>
00464 inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
00465
00466 }
00467
00468 #ifdef CMS_USE_SSE
00469
00470
00471
00472
00473 inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00474 using mathSSE::_mm_dot_ps;
00475 float s;
00476 _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
00477 return s;
00478 }
00479
00480 inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00481 using mathSSE::_mm_cross_ps;
00482 return _mm_cross_ps(a.vec,b.vec);
00483 }
00484
00485
00486 inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00487 return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
00488 }
00489
00490 inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00491 return _mm_cmpeq_ps(a.vec,b.vec);
00492 }
00493
00494 inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00495 return _mm_cmpgt_ps(a.vec,b.vec);
00496 }
00497
00498 #ifdef __SSE3__
00499 inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00500 return _mm_hadd_ps(a.vec,b.vec);
00501 }
00502 #endif
00503
00504
00505 inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
00506 const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
00507 return _mm_xor_ps(a.vec,neg);
00508 }
00509
00510 inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00511 return _mm_and_ps(a.vec,b.vec);
00512 }
00513 inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00514 return _mm_or_ps(a.vec,b.vec);
00515 }
00516 inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00517 return _mm_xor_ps(a.vec,b.vec);
00518 }
00519 inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00520 return _mm_andnot_ps(a.vec,b.vec);
00521 }
00522
00523
00524 inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00525 return _mm_add_ps(a.vec,b.vec);
00526 }
00527
00528 inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00529 return _mm_sub_ps(a.vec,b.vec);
00530 }
00531
00532 inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00533 return _mm_mul_ps(a.vec,b.vec);
00534 }
00535
00536 inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00537 return _mm_div_ps(a.vec,b.vec);
00538 }
00539
00540 inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
00541 return _mm_mul_ps(_mm_set1_ps(a),b.vec);
00542 }
00543
00544 inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
00545 return _mm_mul_ps(_mm_set1_ps(a),b.vec);
00546 }
00547
00548
00549
00550
00551 inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
00552 return -mathSSE::Vec4F(a);
00553 }
00554
00555
00556 inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00557 return mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
00558 }
00559
00560 inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00561 return mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
00562 }
00563
00564 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00565 return mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
00566 }
00567
00568 inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00569 return mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
00570 }
00571
00572
00573 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
00574 return s*mathSSE::Vec4F(a);
00575 }
00576
00577 inline mathSSE::Vec4F operator*(float s,mathSSE::Vec2F a) {
00578 return s*mathSSE::Vec4F(a);
00579 }
00580
00581
00582 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
00583
00584 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
00585 return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
00586 }
00587
00588 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
00589
00590 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00591 return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
00592 }
00593
00594
00596
00597
00598 inline mathSSE::Vec2D::Vec2(Vec4D v4) {
00599 vec = v4.vec[0];
00600 }
00601
00602 inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
00603 arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
00604 }
00605
00606 inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
00607 const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
00608 return _mm_xor_pd(a.vec,neg);
00609 }
00610
00611
00612 inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00613 return _mm_and_pd(a.vec,b.vec);
00614 }
00615 inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00616 return _mm_or_pd(a.vec,b.vec);
00617 }
00618 inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00619 return _mm_xor_pd(a.vec,b.vec);
00620 }
00621 inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00622 return _mm_andnot_pd(a.vec,b.vec);
00623 }
00624
00625
00626 inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00627 return _mm_add_pd(a.vec,b.vec);
00628 }
00629
00630 inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00631 return _mm_sub_pd(a.vec,b.vec);
00632 }
00633
00634 inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00635 return _mm_mul_pd(a.vec,b.vec);
00636 }
00637
00638 inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00639 return _mm_div_pd(a.vec,b.vec);
00640 }
00641
00642 inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
00643 return _mm_mul_pd(_mm_set1_pd(a),b.vec);
00644 }
00645
00646 inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
00647 return _mm_mul_pd(_mm_set1_pd(a),b.vec);
00648 }
00649
00650 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));
00651
00652 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
00653 __m128d res = _mm_mul_pd ( a.vec, b.vec);
00654 res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
00655 double s;
00656 _mm_store_sd(&s,res);
00657 return s;
00658 }
00659
00660 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));
00661
00662 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00663 __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
00664 res = _mm_mul_pd ( a.vec , res );
00665 res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
00666 double s;
00667 _mm_store_sd(&s,res);
00668 return s;
00669 }
00670
00671
00672
00673
00674 inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00675 return
00676 _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
00677 _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
00678 }
00679
00680 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
00681 const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
00682 return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
00683 }
00684
00685
00686 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00687 return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
00688 }
00689
00690
00691 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00692 return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
00693 }
00694 inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00695 return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
00696 }
00697 inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00698 return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
00699 }
00700
00701 inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
00702 __m128d res = _mm_set1_pd(a);
00703 return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
00704 }
00705
00706 inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
00707 __m128d res = _mm_set1_pd(a);
00708 return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
00709 }
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
00728
00729 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00730 __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
00731 _mm_mul_sd ( a.vec[1], b.vec[1])
00732 );
00733 res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
00734 double s;
00735 _mm_store_sd(&s,res);
00736 return s;
00737 }
00738
00739 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
00740
00741 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00742 const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
00743
00744 __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
00745
00746 __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
00747 __m128d m1 = _mm_sub_pd ( l1 , l2 );
00748 m1 = _mm_shuffle_pd ( m1 , m1 , 1 );
00749 m1 = _mm_xor_pd ( m1 , neg );
00750
00751 l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
00752
00753 __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
00754
00755 return mathSSE::Vec4D( m1 , m2 );
00756 }
00757
00758
00759
00760
00761 namespace mathSSE {
00762 template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
00763 template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
00764 template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
00765 template<> inline Vec4D sqrt(Vec4D v) {
00766 return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
00767 }
00768 }
00769
00770
00771 #include "DataFormats/Math/interface/sse_mathfun.h"
00772 namespace mathSSE {
00773 inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
00774 inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
00775 inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
00776 inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
00777 inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
00778
00779 inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00780 inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
00781 inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
00782 inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00783 inline void sincos(float f, float & s, float & c) {
00784 __m128 vs, vc;
00785 sincos_ps(_mm_load_ss(&f),&vs, &vc);
00786 _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);
00787 }
00788 }
00789 #endif // CMS_USE_SSE
00790
00791
00792 #include <iosfwd>
00793 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
00794 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
00795 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
00796 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
00797
00798 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
00799 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
00800
00801
00802 #endif // DataFormat_Math_SSEVec_H