#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX__
#define CMS_USE_AVX
#endif
#else

#ifdef __SSE2__
#define CMS_USE_SSE

#include <mmintrin.h>
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#endif

#include <cmath>

namespace mathSSE {
  // RAII guard: issues vzeroupper on construction and destruction to
  // avoid AVX-to-SSE transition penalties around mixed-ISA code.
  struct ZeroUpper {
    ZeroUpper() {
#ifdef __AVX__
      _mm256_zeroupper();
#endif
    }
    ~ZeroUpper() {
#ifdef __AVX__
      _mm256_zeroupper();
#endif
    }
  };
}

namespace mathSSE {
  // Scalar fallback; specialized below for the SIMD vector types.
  template<typename T> inline T sqrt(T t) { return std::sqrt(t); }
}

namespace mathSSE {

  // True when the two arguments have the same sign bit.
  template<typename T> inline bool samesign(T rh, T lh);

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
    int const mask = 0x80000000;
    return ((rh ^ lh) & mask) == 0;
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
    long long const mask = 0x8000000000000000LL;
    return ((rh ^ lh) & mask) == 0;
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
    union { int i; float f; } a, b;
    a.f = rh; b.f = lh;
    return samesign<int>(a.i, b.i);
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
    union { long long i; double f; } a, b;
    a.f = rh; b.f = lh;
    return samesign<long long>(a.i, b.i);
  }
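
  /* Illustrative note: the float/double overloads compare raw IEEE sign
     bits through a union, so signed zeros are distinguished, e.g.
       samesign(1.f, 2.f)    // true
       samesign(-0.f, 0.f)   // false: sign bits differ
  */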
}

namespace mathSSE {
#ifdef CMS_USE_SSE

  // Dot product of all four lanes; the result is broadcast to every lane.
  inline __m128
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    // Two horizontal adds reduce the four products to one sum per lane.
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    // SSE2 fallback: add pair-swapped and element-swapped copies.
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
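
  /* Usage sketch (x and y are assumed 16-byte-aligned float[4] arrays):
       __m128 a = _mm_load_ps(x), b = _mm_load_ps(y);
       float d;
       _mm_store_ss(&d, _mm_dot_ps(a, b));   // full 4-component dot product
  */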

  // 3D cross product: builds the six products with two shuffle/multiply
  // passes, subtracts, then flips the sign of the y lane.
  inline __m128
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm_cross_ps(__m128 v1, __m128 v2) {
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
    __m128 v5 = _mm_mul_ps(v3, v4);

    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
    v3 = _mm_mul_ps(v3, v4);

    // -0.0f in lane 1 only: the xor flips the sign of the y component.
    const __m128 neg = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);
    return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
  }
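
  /* Component check (layout (x, y, z, w), w ignored on input):
       cross(a, b) = (a.y*b.z - a.z*b.y,
                      a.z*b.x - a.x*b.z,
                      a.x*b.y - a.y*b.x)
     The w lane of the result comes out as a.w*b.w - b.w*a.w == 0. */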
#endif // CMS_USE_SSE

#ifdef CMS_USE_AVX
  // Dot product of all four double lanes; the result is broadcast to every
  // lane: hadd sums within each 128-bit half, the permute swaps the halves
  // so the final add combines them.
  inline __m256d
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }

  // 3D cross product, following the same shuffle/subtract scheme as
  // _mm_cross_ps but built from 128-bit lane permutes; the final xor
  // flips the sign of the y lane.
  inline __m256d
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    const __m256d neg = _mm256_set_pd(0.0, 0.0, -0.0, 0.0);
    return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);
  }
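
  /* Both AVX helpers assume the (x, y, z, w) layout with x in the lowest
     64-bit lane; e.g. loading from a 32-byte-aligned double[4]:
       __m256d a = _mm256_load_pd(p);   // p[0]=x ... p[3]=w
  */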

#endif // CMS_USE_AVX


  // Named-field view (theX, theY, theZ, theW) of the SIMD storage below.
  template<typename T>
  struct OldVec { T theX; T theY; T theZ; T theW; } __attribute__ ((aligned (16)));
#ifdef CMS_USE_AVX
  template<>
  struct OldVec<double> { double theX; double theY; double theZ; double theW; } __attribute__ ((aligned (32)));
#endif

  template<typename T> union Vec4;

  template<typename T> union Vec2 {
    Vec2() {
      arr[0] = 0; arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1; arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1; arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1; arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    template<typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0]; arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T & operator[](unsigned int n) {
      return arr[n];
    }

    T operator[](unsigned int n) const {
      return arr[n];
    }

    T arr[2];
  };

  template<typename T> union Vec4 {
    Vec4() {
      arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3] = 0;
    }
    Vec4(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }
    explicit Vec4(T f1) {
      set1(f1);
    }
    void set(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }
    void set1(T f1) {
      arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__ ((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template<typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0]; arr[1] = v4[1];
  }

#ifdef CMS_USE_SSE

  template<>
  union Vec2<double>;
  template<>
  union Vec4<double>;

  // Bit-pattern views of the SIMD registers, used by setMask() below.
  template<typename T> union Mask2{};
  template<typename T> union Mask4{};

  template<>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__ ((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1; mask[1] = m2; mask[2] = m3; mask[3] = m4;
    }
  };

#ifdef CMS_USE_AVX
  template<>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__ ((aligned(32))) mask[4];
    Mask4() {
      vec = _mm256_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1; mask[1] = m2; mask[2] = m3; mask[3] = m4;
    }
  };
#else
  template<>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__ ((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1; mask[1] = m2; mask[2] = m3; mask[3] = m4;
    }
  };
#endif

  template<>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__ ((aligned(16))) mask[2];
    Mask2() {
      vec = _mm_setzero_pd();
    }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1; mask[1] = m2;
    }
  };
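
  /* Illustrative: an all-ones lane selects, a zero lane clears, e.g.
       Mask2<double> m(~0ULL, 0ULL);           // lane 0 all bits, lane 1 zero
       __m128d masked = _mm_and_pd(v, m.vec);  // keeps only lane 0 of v
  */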

  template<>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__ ((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const & ivec) : o(ivec) {}

    Vec4() {
      vec = _mm_setzero_ps();
    }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) {
      set1(f1);
    }

    Vec4(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0]; arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
      arr[2] = f3; arr[3] = f4;
    }

    // Zero-extend a 2D vector: z and w lanes are set to 0.
    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
    }

    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4); vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) {
      vec = _mm_set_ps(f4, f3, f2, f1);
    }

    void set1(float f1) {
      vec = _mm_set1_ps(f1);
    }

    // Broadcast lane N to all four lanes.
    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float & operator[](unsigned int n) {
      return arr[n];
    }

    float operator[](unsigned int n) const {
      return arr[n];
    }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };

  template<>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__ ((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() {
      vec = _mm_setzero_pd();
    }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) {
      arr[0] = f1; arr[1] = f2;
    }

    explicit Vec2(double f1) {
      set1(f1);
    }

    inline Vec2(Vec4<double> v4);

    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2); vec = mask.vec;
    }

    void set(double f1, double f2) {
      arr[0] = f1; arr[1] = f2;
    }

    void set1(double f1) {
      vec = _mm_set1_pd(f1);
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    double operator[](unsigned int n) const {
      return arr[n];
    }
  };

#ifdef CMS_USE_AVX
} // leave mathSSE so the AVX specializations can be included at global scope
#include "AVXVec.h"

namespace mathSSE {
#else
  template<>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__ ((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    // Widening conversion: each pair of floats becomes a pair of doubles.
    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) {
      set1(f1);
    }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3; arr[3] = f4;
    }

    // Zero-extend a 2D vector: z and w lanes are set to 0.
    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const & ivec) : o(ivec) {}

    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4); vec[0] = mask.vec[0]; vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    void set1(double f1) {
      vec[0] = vec[1] = _mm_set1_pd(f1);
    }

    template<int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double & operator[](unsigned int n) {
      return arr[n];
    }

    double operator[](unsigned int n) const {
      return arr[n];
    }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };


  // Narrowing conversion: convert each 128-bit half to floats and pack
  // the two results into one register.
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif // CMS_USE_AVX

#endif // CMS_USE_SSE

  typedef Vec2<float>  Vec2F;
  typedef Vec4<float>  Vec4F;
  typedef Vec4<float>  Vec3F;   // 3D vectors reuse the 4-wide storage
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;   // 3D vectors reuse the 4-wide storage
  typedef Vec4<double> Vec4D;
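
  /* Usage sketch (illustrative; the operators are defined further below):
       mathSSE::Vec3F p(1.f, 2.f, 3.f);   // w lane defaults to 0
       mathSSE::Vec3F q(4.f, 5.f, 6.f);
       float d = dot(p, q);               // 32.f
       mathSSE::Vec3F c = cross(p, q);    // (-3.f, 6.f, -3.f)
  */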

  // View selecting only the x, y, z components (see operator<< below).
  template<typename T>
  struct As3D {
    Vec4<T> const & v;
    As3D(Vec4<T> const & iv) : v(iv) {}
  };

  template<typename T>
  inline As3D<T> as3D(Vec4<T> const & v) { return v; }

}

#ifdef CMS_USE_SSE

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpeq_ps(a.vec, b.vec);
}

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpgt_ps(a.vec, b.vec);
}

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_hadd_ps(a.vec, b.vec);
}
#endif

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_and_ps(a.vec, b.vec);
}
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_or_ps(a.vec, b.vec);
}
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_xor_ps(a.vec, b.vec);
}
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_andnot_ps(a.vec, b.vec);
}

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_add_ps(a.vec, b.vec);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_sub_ps(a.vec, b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_mul_ps(a.vec, b.vec);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_div_ps(a.vec, b.vec);
}

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
  return _mm_mul_ps(_mm_set1_ps(a), b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) {
  return _mm_mul_ps(_mm_set1_ps(a), b.vec);
}

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

// Dot product of the x and y lanes only; z and w do not contribute.
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
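
/* Example (illustrative):
     dotxy(mathSSE::Vec4F(1.f, 2.f, 9.f, 9.f),
           mathSSE::Vec4F(3.f, 4.f, 9.f, 9.f))   // == 1*3 + 2*4 == 11.f
*/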

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
  return -mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a) + mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a) - mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a) * mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a) / mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
  return s * mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) {
  return s * mathSSE::Vec4F(a);
}

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1];
}

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0];
}

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_and_pd(a.vec, b.vec);
}
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_or_pd(a.vec, b.vec);
}
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_xor_pd(a.vec, b.vec);
}
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_andnot_pd(a.vec, b.vec);
}

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_hadd_pd(a.vec, b.vec);
}
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_add_pd(a.vec, b.vec);
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_sub_pd(a.vec, b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_mul_pd(a.vec, b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_div_pd(a.vec, b.vec);
}

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
  return _mm_mul_pd(_mm_set1_pd(a), b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) {
  return _mm_mul_pd(_mm_set1_pd(a), b.vec);
}

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

// 2D cross product: a.x*b.y - a.y*b.x, computed in the low lane.
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
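
/* The 2D "cross" is the scalar determinant
     | a.x  a.y |
     | b.x  b.y |  =  a.x*b.y - a.y*b.x,
   i.e. the z component of the 3D cross product of (a.x, a.y, 0) and
   (b.x, b.y, 0). */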

#ifndef CMS_USE_AVX

#ifdef __SSE3__
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(a.vec[0], b.vec[0]), hadd(a.vec[1], b.vec[1]));
}
#endif

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

// 3D dot product: sums the x, y and z products; the w lanes never
// enter the sum.
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]),
                           _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

// 3D cross product assembled from the two 128-bit halves: m1 carries the
// x and y components, m2 carries z in its low lane (the w lane of the
// result holds a don't-care value).
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128d neg = _mm_set_pd(0.0, -0.0);

  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);
  m1 = _mm_xor_pd(m1, neg);

  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));

  return mathSSE::Vec4D(m1, m2);
}
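
/* Component check (illustrative):
     cross(mathSSE::Vec4D(1., 2., 3.), mathSSE::Vec4D(4., 5., 6.))
       -> (2*6 - 3*5, 3*4 - 1*6, 1*5 - 2*4, <don't care>)
        = (-3., 6., -3., ...)
*/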

inline double
__attribute__((always_inline)) __attribute__ ((pure))
dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif // CMS_USE_AVX


namespace mathSSE {
  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec); }
  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v)); }
  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec); }
#ifdef CMS_USE_AVX
  template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec); }
#else
  template<> inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}

// Approximate SIMD transcendentals from sse_mathfun.
#include "DataFormats/Math/interface/sse_mathfun.h"
namespace mathSSE {
  inline Vec4F log(Vec4F v) { return log_ps(v.vec); }
  inline Vec4F exp(Vec4F v) { return exp_ps(v.vec); }
  inline Vec4F sin(Vec4F v) { return sin_ps(v.vec); }
  inline Vec4F cos(Vec4F v) { return cos_ps(v.vec); }
  inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec, &s.vec, &c.vec); }

  // Scalar wrappers: load the argument into lane 0 and store lane 0 back.
  inline float log(float f) { float s; _mm_store_ss(&s, log_ps(_mm_load_ss(&f))); return s; }
  inline float exp(float f) { float s; _mm_store_ss(&s, exp_ps(_mm_load_ss(&f))); return s; }
  inline float sin(float f) { float s; _mm_store_ss(&s, sin_ps(_mm_load_ss(&f))); return s; }
  inline float cos(float f) { float s; _mm_store_ss(&s, cos_ps(_mm_load_ss(&f))); return s; }
  inline void sincos(float f, float & s, float & c) {
    __m128 vs, vc;
    sincos_ps(_mm_load_ss(&f), &vs, &vc);
    _mm_store_ss(&s, vs); _mm_store_ss(&c, vc);
  }
}
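
/* Example (illustrative):
     float s, c;
     mathSSE::sincos(0.5f, s, c);   // s ~ std::sin(0.5f), c ~ std::cos(0.5f)
   The *_ps routines are polynomial approximations, so the last few ulps
   may differ from libm. */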
#endif // CMS_USE_SSE


#include <iosfwd>
std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);

std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);

#endif // DataFormat_Math_SSEVec_H