00001 #ifndef DataFormat_Math_SSEVec_H
00002 #define DataFormat_Math_SSEVec_H
00003
00004 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
00005 #include <x86intrin.h>
00006 #define CMS_USE_SSE
00007 #ifdef __AVX__
00008 #define CMS_USE_AVX
00009 #endif
00010 #else
00011
00012 #ifdef __SSE2__
00013 #define CMS_USE_SSE
00014
00015 #include <mmintrin.h>
00016 #include <emmintrin.h>
00017 #endif
00018 #ifdef __SSE3__
00019 #include <pmmintrin.h>
00020 #endif
00021 #ifdef __SSE4_1__
00022 #include <smmintrin.h>
00023 #endif
00024
00025 #endif
00026
00027 #include<cmath>
00028
00029
namespace mathSSE {
  // RAII guard that issues vzeroupper on construction and destruction when
  // compiled with AVX — presumably to avoid SSE/AVX transition penalties in
  // code mixing 128-bit and 256-bit instructions (TODO confirm intent).
  // Compiles to a no-op when AVX is not enabled.
  struct ZeroUpper {
    ZeroUpper() {
#ifdef __AVX__
      _mm256_zeroupper();
#endif
    }
    ~ZeroUpper() {
#ifdef __AVX__
      _mm256_zeroupper();
#endif
    }
  };
}
00044
namespace mathSSE {
  // Scalar fallback square root; SIMD specializations for the vector
  // types are provided further down in this header.
  template<typename T>
  inline T sqrt(T t) {
    T const root = std::sqrt(t);
    return root;
  }
}
00048
namespace mathSSE {

  // Returns true when rh and lh carry the same sign bit.
  // Note for the floating-point overloads: +0.0 and -0.0 have different
  // sign bits, so samesign(+0.0, -0.0) is false.
  template<typename T> inline bool samesign(T rh, T lh);

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
    // XOR leaves the sign bit set iff the signs differ. Build the mask from
    // an unsigned literal: plain 0x80000000 is unsigned and its conversion
    // to int is implementation-defined before C++20.
    int const mask = static_cast<int>(0x80000000u);
    return ((rh^lh)&mask) == 0;
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
    // 0x8000000000000000LL does not fit in long long (the literal silently
    // becomes unsigned long long); make the conversion explicit instead.
    long long const mask = static_cast<long long>(0x8000000000000000ULL);
    return ((rh^lh)&mask) == 0;
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
    // Type-pun through a union (GCC-sanctioned) to inspect the sign bit.
    union { int i; float f; } a, b;
    a.f=rh; b.f=lh;
    return samesign<int>(a.i,b.i);
  }

  template<>
  inline bool
  __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
    // Same trick as the float overload, with 64-bit integers.
    union { long long i; double f; } a, b;
    a.f=rh; b.f=lh;
    return samesign<long long>(a.i,b.i);
  }
}
00083
00084
00085 namespace mathSSE {
00086 #ifdef CMS_USE_SSE
00087
// 4-component dot product of two __m128 vectors; the result is broadcast
// to (at least) lane 0 of the returned vector. Picks the best instruction
// sequence available at compile time.
inline __m128
__attribute__((always_inline)) __attribute__ ((pure))
_mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
  // Single dot-product instruction: multiply all lanes, sum, broadcast.
  return _mm_dp_ps(v1, v2, 0xff);
#else
  __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
  // Two horizontal adds reduce the four products to a broadcast sum.
  mul = _mm_hadd_ps(mul,mul);
  return _mm_hadd_ps(mul,mul);
#else
  // SSE2 fallback: shuffle+add twice to sum all four lanes.
  __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
  mul = _mm_add_ps(mul, swp);
  swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_add_ps(mul, swp);
#endif
#endif
}
00106
00107
00108
// 3D cross product of the x,y,z lanes of two __m128 vectors, computed with
// shuffles: two shuffled products are subtracted, then the sign of the y
// lane is fixed up by XOR-ing with -0.0f in that lane.
inline __m128
__attribute__((always_inline)) __attribute__ ((pure))
_mm_cross_ps(__m128 v1, __m128 v2) {

  __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));

  __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

  __m128 v5 = _mm_mul_ps(v3, v4);

  // Second product with operands swapped between v1 and v2.
  v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));

  v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

  v3 = _mm_mul_ps(v3, v4);
  // Flip the sign bit of lane 1 (y) only.
  const __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
  return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
}
00129 #endif // CMS_USE_SSE
00130
00131 #ifdef CMS_USE_AVX
// 4-component double-precision dot product (AVX); the sum is broadcast to
// all four lanes: hadd sums within each 128-bit half, then the halves are
// swapped with permute2f128 and added together.
inline __m256d
__attribute__((always_inline)) __attribute__ ((pure))
_mm256_dot_pd(__m256d v1, __m256d v2) {
  __m256d mul = _mm256_mul_pd(v1, v2);
  mul = _mm256_hadd_pd(mul,mul);
  __m256d tmp = _mm256_permute2f128_pd(mul,mul,1);
  return _mm256_add_pd(mul,tmp);
}
00140
// 3D cross product of the x,y,z lanes of two __m256d vectors (AVX analogue
// of _mm_cross_ps): lane rotations are built from 128-bit-half permutes
// plus in-lane permutes, two shuffled products are subtracted, and the
// sign of lane 1 (y) is fixed with an XOR against -0.0 in that lane.
inline __m256d
__attribute__((always_inline)) __attribute__ ((pure))
_mm256_cross_pd(__m256d v1, __m256d v2) {

  __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2<<4)+1);
  v3 = _mm256_permute_pd(v3,0);

  __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2<<4));
  v4 = _mm256_permute_pd(v4,5);

  __m256d v5 = _mm256_mul_pd(v3, v4);

  // Second product with the roles of v1 and v2 exchanged.
  v3 = _mm256_permute2f128_pd(v1, v2, (2<<4)+1);
  v3 = _mm256_permute_pd(v3,0);

  v4 = _mm256_permute2f128_pd(v2, v1, (2<<4));
  v4 = _mm256_permute_pd(v4,5);

  v3 = _mm256_mul_pd(v3, v4);
  // Flip the sign bit of lane 1 (y) only.
  const __m256d neg = _mm256_set_pd(0.0,0.0,-0.0,0.0);
  return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);

}
00164
00165 #endif // CMS_USE_AVX
00166
00167
00168
// Legacy plain-member layout (theX..theW); aligned so it can alias the
// SIMD register members inside the Vec4 unions below.
template<typename T>
struct OldVec { T theX; T theY; T theZ; T theW;} __attribute__ ((aligned (16)));
#ifdef CMS_USE_AVX
// 32-byte alignment to match __m256d storage when AVX is in use.
template<>
struct OldVec<double> { double theX; double theY; double theZ; double theW;} __attribute__ ((aligned (32)));
#endif
00175
template<typename T> union Vec4;

// Generic two-component vector (scalar fallback; an SSE specialization for
// double is provided below when CMS_USE_SSE is defined).
template<typename T> union Vec2{
  // Zero-initialize both components.
  Vec2() {
    arr[0] = 0; arr[1] = 0;
  }
  // Component-wise construction.
  Vec2(T f1, T f2) {
    set(f1, f2);
  }
  // Broadcast a single value into both components.
  explicit Vec2(T f1) {
    set(f1, f1);
  }

  // Assign both components at once.
  void set(T f1, T f2) {
    arr[0] = f1;
    arr[1] = f2;
  }

  // Broadcast component N into both slots of a new Vec2.
  template <int N>
  Vec2 get1() const {
    T const value = arr[N];
    return Vec2(value, value);
  }

  // Converting constructor from a Vec2 with a different element type.
  template<typename U>
  Vec2(Vec2<U> v) {
    arr[0] = v[0];
    arr[1] = v[1];
  }

  // Truncating conversion from Vec4 (keeps x,y); defined after Vec4.
  inline Vec2(Vec4<T> v4);

  T & operator[](unsigned int n) {
    return arr[n];
  }

  T operator[](unsigned int n) const {
    return arr[n];
  }

  T arr[2];
};
00223
00224
00225 template<typename T> union Vec4{
00226 Vec4() {
00227 arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
00228 }
00229 Vec4(float f1, float f2, float f3, float f4=0) {
00230 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00231 }
00232 explicit Vec4(float f1) {
00233 set1(f1);
00234 }
00235 void set(float f1, float f2, float f3, float f4=0) {
00236 arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00237 }
00238 void set1(float f1) {
00239 arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
00240 }
00241 template <int N>
00242 Vec4 get1() const {
00243 return Vec4(arr[N],arr[N],arr[N],arr[N]);
00244 }
00245
00246
00247
00248
00249
00250
00251 Vec2<T> xy() const { return Vec2<T>(arr[0],arr[1]);}
00252 Vec2<T> zw() const { return Vec2<T>(arr[2],arr[3]);}
00253
00254
00255
00256 T __attribute__ ((aligned(16))) arr[4];
00257 OldVec<T> o;
00258 };
00259
// Truncating conversion: keep the x,y components of a Vec4. Defined here,
// after Vec4, because Vec2 only forward-declares it.
template<typename T>
inline Vec2<T>::Vec2(Vec4<T> v4) {
  arr[0]=v4[0];arr[1]=v4[1];
}
00264
00265
00266 #ifdef CMS_USE_SSE
00267
00268 template<>
00269 union Vec2<double>;
00270 template<>
00271 union Vec4<double>;
00272
// Primary mask templates are intentionally empty; only the explicit
// specializations below are usable.
template<typename T> union Mask2{};
template<typename T> union Mask4{};
00275
// Bit mask overlaying an __m128: the four 32-bit words are written through
// mask[] and read back as a vector (union type punning, GCC-supported).
template<>
union Mask4<float> {
  __m128 vec;
  unsigned int __attribute__ ((aligned(16))) mask[4];
  Mask4() {vec = _mm_setzero_ps();}
  Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
    mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
  }
};
00285
00286 #ifdef CMS_USE_AVX
// AVX variant: four 64-bit mask words overlaying a single __m256d.
template<>
union Mask4<double> {
  __m256d vec;
  unsigned long long __attribute__ ((aligned(32))) mask[4];
  Mask4() {
    vec = _mm256_setzero_pd();
  }
  Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
    mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
  }
};
00298 #else
// SSE variant: four 64-bit mask words overlaying a pair of __m128d.
template<>
union Mask4<double> {
  __m128d vec[2];
  unsigned long long __attribute__ ((aligned(16))) mask[4];
  Mask4() {
    vec[0] = _mm_setzero_pd();
    vec[1] = _mm_setzero_pd();
  }
  Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
    mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
  }
};
00311 #endif
00312
// Two 64-bit mask words overlaying a single __m128d.
template<>
union Mask2<double> {
  __m128d vec;
  unsigned long long __attribute__ ((aligned(16))) mask[2];
  Mask2() {
    vec = _mm_setzero_pd();
  }
  Mask2(unsigned long long m1, unsigned long long m2) {
    mask[0]=m1; mask[1]=m2;
  }
};
00324
// SSE specialization: four floats in a single __m128 register, with the
// scalar array and the legacy OldVec layout aliasing the same storage.
template<>
union Vec4<float> {
  typedef __m128 nativeType;
  __m128 vec;
  float __attribute__ ((aligned(16))) arr[4];
  OldVec<float> o;

  // Wrap an existing SSE register.
  Vec4(__m128 ivec) : vec(ivec) {}

  // Construct from the legacy layout.
  Vec4(OldVec<float> const & ivec) : o(ivec) {}

  // Zero-initialize.
  Vec4() {
    vec = _mm_setzero_ps();
  }

  // Narrowing conversion from Vec4<double>; defined after Vec4<double>.
  inline Vec4(Vec4<double> ivec);

  // Broadcast a single value into all four lanes.
  explicit Vec4(float f1) {
    set1(f1);
  }

  // Note _mm_set_ps takes arguments high-lane first, hence the reversal.
  Vec4(float f1, float f2, float f3, float f4=0) {
    vec = _mm_set_ps(f4, f3, f2, f1);
  }

  // Concatenate two Vec2s into (x,y,z,w).
  Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
    arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
    arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
  }

  // Vec2 plus explicit z (and optional w).
  Vec4( Vec2<float> ivec0, float f3, float f4=0) {
    arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
    arr[2] = f3; arr[3] = f4;
  }

  // Vec2 widened with zeroed z,w.
  Vec4( Vec2<float> ivec0) {
    vec = _mm_setzero_ps();
    arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
  }

  // Load an arbitrary bit pattern (e.g. for masking via the bitwise ops).
  void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
    Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec;
  }

  void set(float f1, float f2, float f3, float f4=0) {
    vec = _mm_set_ps(f4, f3, f2, f1);
  }

  void set1(float f1) {
    vec = _mm_set1_ps(f1);
  }

  // Broadcast lane N into all four lanes of a new Vec4.
  template <int N>
  Vec4 get1() const {
    return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
  }

  float & operator[](unsigned int n) {
    return arr[n];
  }

  float operator[](unsigned int n) const {
    return arr[n];
  }

  // Copies of the low (x,y) and high (z,w) lane pairs.
  Vec2<float> xy() const { return Vec2<float>(arr[0],arr[1]);}
  Vec2<float> zw() const { return Vec2<float>(arr[2],arr[3]);}

};
00398
00399
// SSE specialization: two doubles in a single __m128d register, aliased
// with a scalar array through the union.
template<>
union Vec2<double> {
  typedef __m128d nativeType;
  __m128d vec;
  double __attribute__ ((aligned(16))) arr[2];

  // Wrap an existing SSE register.
  Vec2(__m128d ivec) : vec(ivec) {}

  // Zero-initialize.
  Vec2() {
    vec = _mm_setzero_pd();
  }

  // Widening conversion from Vec2<float>; defined later in this header.
  inline Vec2(Vec2<float> ivec);

  // Note _mm_set_pd takes arguments high-lane first, hence the reversal.
  Vec2(double f1, double f2) {
    vec = _mm_set_pd(f2,f1);
  }

  // Broadcast a single value into both lanes.
  explicit Vec2(double f1) {
    set1(f1);
  }

  // Truncating conversion from Vec4<double>; defined later in this header.
  inline Vec2(Vec4<double> v4);

  // Load an arbitrary bit pattern (e.g. for masking via the bitwise ops).
  void setMask(unsigned long long m1, unsigned long long m2) {
    Mask2<double> mask(m1,m2); vec=mask.vec;
  }

  void set(double f1, double f2) {
    vec = _mm_set_pd(f2,f1);
  }

  void set1(double f1) {
    vec = _mm_set1_pd(f1);
  }

  // Broadcast component N into both slots of a new Vec2.
  template <int N>
  Vec2 get1() const {
    return Vec2(arr[N],arr[N]);
  }

  double & operator[](unsigned int n) {
    return arr[n];
  }
  double operator[](unsigned int n) const {
    return arr[n];
  }
};
00456
00457
00458
00459
00460 #ifdef CMS_USE_AVX
00461 }
00462 #include "AVXVec.h"
00463
00464 namespace mathSSE {
00465 #else
// Non-AVX specialization: four doubles held as a pair of __m128d registers
// (vec[0] = x,y; vec[1] = z,w), aliased with a scalar array and the legacy
// OldVec layout through the union.
template<>
union Vec4<double> {
  __m128d vec[2];
  double __attribute__ ((aligned(16))) arr[4];
  OldVec<double> o;

  // Wrap an existing register pair.
  Vec4(__m128d ivec[]) {
    vec[0] = ivec[0];
    vec[1] = ivec[1];
  }

  Vec4(__m128d ivec0, __m128d ivec1) {
    vec[0] = ivec0;
    vec[1] = ivec1;
  }

  // Zero-initialize.
  Vec4() {
    vec[0] = _mm_setzero_pd();
    vec[1] = _mm_setzero_pd();
  }

  // Widening conversion: convert (x,y) then the shuffled (z,w) pair.
  Vec4( Vec4<float> ivec) {
    vec[0] = _mm_cvtps_pd(ivec.vec);
    vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
  }

  // Broadcast a single value into all four components.
  explicit Vec4(double f1) {
    set1(f1);
  }

  // Component-wise construction through the scalar view of the union.
  Vec4(double f1, double f2, double f3, double f4=0) {
    arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
  }

  // Concatenate two Vec2s into (x,y,z,w).
  Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
    vec[0] = ivec0.vec;
    vec[1] = ivec1.vec;
  }

  // Vec2 plus explicit z (and optional w), mixing register and scalar writes.
  Vec4( Vec2<double> ivec0, double f3, double f4=0) {
    vec[0] = ivec0.vec;
    arr[2] = f3; arr[3] = f4;
  }

  // Vec2 widened with zeroed z,w.
  Vec4( Vec2<double> ivec0) {
    vec[0] = ivec0.vec;
    vec[1] = _mm_setzero_pd();
  }

  // Construct from the legacy layout.
  Vec4(OldVec<double> const & ivec) : o(ivec) {}

  // Load an arbitrary bit pattern (e.g. for masking via the bitwise ops).
  void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
    Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1];
  }

  void set(double f1, double f2, double f3, double f4=0) {
    arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
  }

  void set1(double f1) {
    vec[0] = vec[1]= _mm_set1_pd(f1);
  }

  // Broadcast component N into all four slots of a new Vec4.
  template<int N>
  Vec4 get1() const {
    return Vec4(arr[N],arr[N],arr[N],arr[N]);
  }

  double & operator[](unsigned int n) {
    return arr[n];
  }

  double operator[](unsigned int n) const {
    return arr[n];
  }

  // The two register halves map directly onto the (x,y) and (z,w) pairs.
  Vec2<double> xy() const { return vec[0];}
  Vec2<double> zw() const { return vec[1];}

};
00550
00551
// Narrowing conversion: convert each __m128d half to two floats, then pack
// the two low pairs into a single __m128.
inline Vec4<float>::Vec4(Vec4<double> ivec) {
  vec = _mm_cvtpd_ps(ivec.vec[0]);
  __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
  vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
}
00557 #endif // CMS_USE_AVX
00558
00559
00560 #endif // CMS_USE_SSE
00561
// Convenience aliases. The 3D types reuse the 4-component storage; the w
// component is just padding.
typedef Vec2<float> Vec2F;
typedef Vec4<float> Vec4F;
typedef Vec4<float> Vec3F;
typedef Vec2<double> Vec2D;
typedef Vec4<double> Vec3D;
typedef Vec4<double> Vec4D;
00568
// Lightweight wrapper used by the stream-output overloads below to print a
// Vec4 as a 3-vector. Holds a reference: must not outlive the wrapped Vec4.
template<typename T>
struct As3D {
  Vec4<T> const & v;
  As3D(Vec4<T> const &iv ) : v(iv){}
};
00574
// Helper to create an As3D view without spelling out the template argument.
template<typename T>
inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
00577
00578 }
00579
00580 #ifdef CMS_USE_SSE
00581
00582
00583
00584
00585
// True only when all four lanes compare equal.
inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
}

// Lane-wise equality: each result lane is all-ones or all-zeros.
inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpeq_ps(a.vec,b.vec);
}

// Lane-wise greater-than mask.
inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpgt_ps(a.vec,b.vec);
}

#ifdef __SSE3__
// Horizontal add: returns { a0+a1, a2+a3, b0+b1, b2+b3 }.
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_hadd_ps(a.vec,b.vec);
}
#endif
00603
00604
// Unary minus: flip the sign bit of every lane by XOR-ing with -0.0f.
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
  return _mm_xor_ps(a.vec,neg);
}

// Lane-wise bitwise operations (mainly useful with the cmp*/setMask masks).
inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_and_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_or_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_xor_ps(a.vec,b.vec);
}
// (~a) & b, matching the _mm_andnot_ps operand order.
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_andnot_ps(a.vec,b.vec);
}
00622
00623
// Lane-wise arithmetic on all four float lanes.
inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_add_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_sub_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_mul_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_div_ps(a.vec,b.vec);
}

// Lane-wise minimum / maximum.
inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_min_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_max_ps(a.vec,b.vec);
}
00647
00648
// Scalar * vector: broadcast the scalar to all lanes, then multiply.
inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

// Vector / scalar: broadcast then divide lane-wise.
inline mathSSE::Vec4F operator/(mathSSE::Vec4F b,float a) {
  return _mm_div_ps(b.vec,_mm_set1_ps(a));
}
00660
00661
// Scalar dot product of all four lanes (extracted from lane 0 of the
// broadcast result of _mm_dot_ps).
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
  return s;
}

// 3D cross product of the x,y,z lanes.
inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec,b.vec);
}
00673
00674
// Dot product of the x,y lanes only: multiply, then sum adjacent lanes so
// that lane 0 holds x*x' + y*y'.
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a*b;
#ifdef __SSE3__
  mul = hadd(mul,mul);
#else
  // SSE2 fallback: swap adjacent lanes and add, leaving x+y in lane 0.
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s,mul.vec);
  return s;
}
00687
00688
00689
00690
00691
// Vec2F operators: promote to Vec4F (z,w zero-filled by the converting
// constructor) and reuse the Vec4F implementations. Results are returned
// as Vec4F; callers using them as Vec2 read only the x,y lanes. Note that
// operator/ produces 0/0 = NaN in the unused upper lanes.
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
  return -mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return min(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
}

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return ::max(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
}

// Scalar multiply / divide, again via the Vec4F overloads.
inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
  return s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) {
  return s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) {
  return mathSSE::Vec4F(a)/s;
}
00733
00734
00735 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
00736
00737 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
00738 return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
00739 }
00740
00741 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
00742
00743 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00744 return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
00745 }
00746
00747
00749
00750
00751
// Vec2D from Vec4D: keep the low half (x,y) of the 4-component vector.
inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

// Vec2D from Vec2F: widen the float components to double.
inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
}
00763
// Unary minus: flip the sign bit of both lanes by XOR-ing with -0.0.
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
  return _mm_xor_pd(a.vec,neg);
}

// Lane-wise bitwise operations (mainly useful with setMask-built masks).
inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_and_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_or_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_xor_pd(a.vec,b.vec);
}
// (~a) & b, matching the _mm_andnot_pd operand order.
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_andnot_pd(a.vec,b.vec);
}

#ifdef __SSE3__
// Horizontal add: returns { a0+a1, b0+b1 }.
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_hadd_pd(a.vec,b.vec);
}
#endif
00788
// Lane-wise arithmetic on both double lanes.
inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_add_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_sub_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_mul_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_div_pd(a.vec,b.vec);
}

// Lane-wise minimum / maximum.
inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_min_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_max_pd(a.vec,b.vec);
}

// Scalar multiply / divide: broadcast the scalar, then operate lane-wise.
inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b,double a) {
  return _mm_div_pd(b.vec,_mm_set1_pd(a));
}
00826
00827
// 2D dot product: multiply lane-wise, then sum the two lanes into lane 0.
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
  __m128d res = _mm_mul_pd ( a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res,res);
#else
  // SSE2 fallback: add the swapped lanes into the low lane.
  res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
#endif
  double s;
  _mm_store_sd(&s,res);
  return s;
}

// 2D cross product (z of the 3D cross): ax*by - ay*bx, computed by
// multiplying against the lane-swapped b and subtracting the high lane.
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
  res = _mm_mul_pd ( a.vec , res );
  res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
  double s;
  _mm_store_sd(&s,res);
  return s;
}
00852
00853
00854 #ifndef CMS_USE_AVX
00855
00856
00857
00858 #ifdef __SSE3__
00859
// Horizontal add applied independently to the two __m128d halves.
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(a.vec[0],b.vec[0]),hadd(a.vec[1],b.vec[1]) );
}
00863 #endif
00864
00865
// True only when all four lanes (both register halves) compare equal.
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
}

// Unary minus: flip the sign bit of every lane in both halves.
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
}

// Lane-wise arithmetic, applied to each __m128d half independently.
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
}

// Lane-wise minimum / maximum via the Vec2D overloads on each half.
inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(Vec2D(a.vec[0]),Vec2D(b.vec[0])),::min(Vec2D(a.vec[1]),Vec2D(b.vec[1])) );
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(Vec2D(a.vec[0]),Vec2D(b.vec[0])),::max(Vec2D(a.vec[1]),Vec2D(b.vec[1])) );
}

// Scalar multiply: broadcast once, multiply both halves.
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

// Scalar divide implemented as multiplication by the reciprocal; rounding
// may differ slightly from a true lane-wise division.
inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1./a;
  return a*b;
}
00918
00919
00920
00921
00922
00923
00924
00925
00926
00927
00928
00929
00930
00931
00932
00933
00934
// 3-component dot product (x,y,z only — _mm_mul_sd touches just the low
// lane of the second half, so w is ignored).
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  // Low lane: x*x' + z*z'; high lane: y*y'.
  __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
                             _mm_mul_sd ( a.vec[1], b.vec[1])
                             );
  // Fold the high lane into the low lane to complete the sum.
  res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
  double s;
  _mm_store_sd(&s,res);
  return s;
}
00946
// 3D cross product over the two-register representation: m1 holds the x,y
// components (with a sign fix-up on one lane via XOR with -0.0), m2 the z
// component in its low lane.
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );

  // l1 = az * (bx, by) ; l2 = bz * (ax, ay).
  __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );

  __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
  __m128d m1 = _mm_sub_pd ( l1 , l2 );
  m1 = _mm_shuffle_pd ( m1 , m1 , 1 );
  // Flip the sign bit of the low lane only.
  m1 = _mm_xor_pd ( m1 , neg );

  // z component: ax*by - ay*bx, left in the low lane of m2.
  l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );

  __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
  return mathSSE::Vec4D(m1, m2);
}
00964
00965
// Dot product of the x,y components only, via the Vec2D overload.
inline double
__attribute__((always_inline)) __attribute__ ((pure))
dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(),b.xy());
}
00971
00972 #endif // CMS_USE_AVX
00973
00974
00975
namespace mathSSE {
  // SIMD element-wise square roots, specializing the generic scalar
  // template defined near the top of this header.
  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
  // Vec2F has no SSE register of its own: promote to Vec4F, then truncate.
  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
#ifdef CMS_USE_AVX
  template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec);}
#else
  // Non-AVX: apply sqrt to each __m128d half.
  template<> inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
  }
#endif
}
00988
00989
00990 #include "DataFormats/Math/interface/sse_mathfun.h"
00991 namespace mathSSE {
00992 inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
00993 inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
00994 inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
00995 inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
00996 inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
00997
00998 inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00999 inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
01000 inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
01001 inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
01002 inline void sincos(float f, float & s, float & c) {
01003 __m128 vs, vc;
01004 sincos_ps(_mm_load_ss(&f),&vs, &vc);
01005 _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);
01006 }
01007 }
01008 #endif // CMS_USE_SSE
01009
01010
01011 #include <iosfwd>
01012 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
01013 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
01014 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
01015 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
01016
01017 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
01018 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
01019
01020
01021 #endif // DataFormat_Math_SSEVec_H