CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
SSEVec.h
Go to the documentation of this file.
1 #ifndef DataFormat_Math_SSEVec_H
2 #define DataFormat_Math_SSEVec_H
3 
4 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
5 #include <x86intrin.h>
6 #define CMS_USE_SSE
7 
8 #else
9 
10 #ifdef __SSE2__
11 #define CMS_USE_SSE
12 
13 #include <mmintrin.h>
14 #include <emmintrin.h>
15 #endif
16 #ifdef __SSE3__
17 #include <pmmintrin.h>
18 #endif
19 #ifdef __SSE4_1__
20 #include <smmintrin.h>
21 #endif
22 
23 #endif
24 
25 #include<cmath>
26 
27 namespace mathSSE {
28  template<typename T> inline T sqrt(T t) { return std::sqrt(t);}
29 }
30 
31 namespace mathSSE {
32  //
33  template<typename T> inline bool samesign(T rh, T lh);
34 
35  template<>
36  inline bool
37  __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
38  int const mask= 0x80000000;
39  return ((rh^lh)&mask) == 0;
40  }
41 
42  template<>
43  inline bool
44  __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
45  long long const mask= 0x8000000000000000LL;
46  return ((rh^lh)&mask) == 0;
47  }
48 
49  template<>
50  inline bool
51  __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
52  union { int i; float f; } a, b;
53  a.f=rh; b.f=lh;
54  return samesign<int>(a.i,b.i);
55  }
56 
57  template<>
58  inline bool
59  __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
60  union { long long i; double f; } a, b;
61  a.f=rh; b.f=lh;
62  return samesign<long long>(a.i,b.i);
63  }
64 }
65 
66 
67 namespace mathSSE {
68 #ifdef CMS_USE_SSE
69  //dot
70  inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
71 #ifdef __SSE4_1__
72  return _mm_dp_ps(v1, v2, 0xff);
73 #else
74  __m128 mul = _mm_mul_ps(v1, v2);
75 #ifdef __SSE3__
76  mul = _mm_hadd_ps(mul,mul);
77  return _mm_hadd_ps(mul,mul);
78 #else
79  __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
80  mul = _mm_add_ps(mul, swp);
81  swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
82  return _mm_add_ps(mul, swp);
83 #endif
84 #endif
85  }
86 
87 
88  // cross (just 3x3)
89  inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
90  __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
91  __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
92 
93  __m128 v5 = _mm_mul_ps(v3, v4);
94 
95  v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
96  v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
97 
98  v3 = _mm_mul_ps(v3, v4);
99  const __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
100  return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
101  }
102 
103 
104 #endif // CMS_USE_SSE
105 
106 
107  template<typename T>
108  struct OldVec { T theX; T theY; T theZ; T theW;} __attribute__ ((aligned (16)));
109 
110  template<typename T> union Vec4;
111 
112  template<typename T> union Vec2{
113  Vec2() {
114  arr[0] = 0; arr[1] = 0;
115  }
116  Vec2(T f1, T f2) {
117  arr[0] = f1; arr[1] = f2;
118  }
119  explicit Vec2(T f1) {
120  arr[0] = f1; arr[1] = f1;
121  }
122 
123  void set(T f1, T f2) {
124  arr[0] = f1; arr[1] = f2;
125  }
126  Vec2 get1(unsigned int n) const {
127  return Vec2(arr[n],arr[n]);
128  }
129 
130  template<typename U>
132  arr[0] = v[0]; arr[1] = v[1];
133 
134  }
135 
136  inline Vec2(Vec4<T> v4);
137 
138  T & operator[](unsigned int n) {
139  return arr[n];
140  }
141 
142  T operator[](unsigned int n) const {
143  return arr[n];
144  }
145 
146 
147  T arr[2];
148  };
149 
150 
151  template<typename T> union Vec4{
152  Vec4() {
153  arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
154  }
155  Vec4(float f1, float f2, float f3, float f4=0) {
156  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
157  }
158  explicit Vec4(float f1) {
159  set1(f1);
160  }
161  void set(float f1, float f2, float f3, float f4=0) {
162  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
163  }
164  void set1(float f1) {
165  arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
166  }
167  Vec4 get1(unsigned int n) const {
168  return Vec4(arr[n],arr[n],arr[n],arr[n]);
169  }
170 
171  Vec2<T> xy() const { return Vec2<T>(arr[0],arr[1]);}
172  Vec2<T> zw() const { return Vec2<T>(arr[2],arr[3]);}
173 
174 
175 
176  T __attribute__ ((aligned(16))) arr[4];
178  };
179 
180  template<typename T>
181  inline Vec2<T>::Vec2(Vec4<T> v4) {
182  arr[0]=v4[0];arr[1]=v4[1];
183  }
184 
185 
186 #ifdef CMS_USE_SSE
187 
188  template<>
189  union Vec2<double>;
190  template<>
191  union Vec4<double>;
192 
193  template<typename T> union Mask2{};
194  template<typename T> union Mask4{};
195 
196  template<>
197  union Mask4<float> {
198  __m128 vec;
199  unsigned int __attribute__ ((aligned(16))) mask[4];
200  Mask4() {vec = _mm_setzero_ps();}
201  Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
202  mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
203  }
204  };
205 
206  template<>
207  union Mask4<double> {
208  __m128d vec[2];
209  unsigned long long __attribute__ ((aligned(16))) mask[4];
210  Mask4() {
211  vec[0] = _mm_setzero_pd();
212  vec[1] = _mm_setzero_pd();
213  }
214  Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
215  mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
216  }
217  };
218 
219 
220  template<>
221  union Mask2<double> {
222  __m128d vec;
223  unsigned long long __attribute__ ((aligned(16))) mask[2];
224  Mask2() {
225  vec = _mm_setzero_pd();
226  }
227  Mask2(unsigned long long m1, unsigned long long m2) {
228  mask[0]=m1; mask[1]=m2;
229  }
230  };
231 
232  template<>
233  union Vec4<float> {
234  typedef __m128 nativeType;
235  __m128 vec;
236  float __attribute__ ((aligned(16))) arr[4];
237  OldVec<float> o;
238 
239  Vec4(__m128 ivec) : vec(ivec) {}
240 
241  Vec4(OldVec<float> const & ivec) : o(ivec) {}
242 
243  Vec4() {
244  vec = _mm_setzero_ps();
245  }
246 
247 
248  inline Vec4(Vec4<double> ivec);
249 
250  explicit Vec4(float f1) {
251  set1(f1);
252  }
253 
254  Vec4(float f1, float f2, float f3, float f4=0) {
255  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
256  }
257 
258 
259  Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
260  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
261  arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
262  }
263 
264  Vec4( Vec2<float> ivec0, float f3, float f4=0) {
265  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
266  arr[2] = f3; arr[3] = f4;
267  }
268 
269  Vec4( Vec2<float> ivec0) {
270  vec = _mm_setzero_ps();
271  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
272  }
273 
274 
275  // for masking
276  void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
277  Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec;
278  }
279 
280  void set(float f1, float f2, float f3, float f4=0) {
281  vec = _mm_set_ps(f4, f3, f2, f1);
282  }
283 
284  void set1(float f1) {
285  vec = _mm_set1_ps(f1);
286  }
287 
288  Vec4 get1(unsigned int n) const {
289  return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(n, n, n, n));
290  }
291 
292  float & operator[](unsigned int n) {
293  return arr[n];
294  }
295 
296  float operator[](unsigned int n) const {
297  return arr[n];
298  }
299 
300  Vec2<float> xy() const { return Vec2<float>(arr[0],arr[1]);}
301  Vec2<float> zw() const { return Vec2<float>(arr[2],arr[3]);}
302 
303  };
304 
305 
306  template<>
307  union Vec2<double> {
308  typedef __m128d nativeType;
309  __m128d vec;
310  double __attribute__ ((aligned(16))) arr[2];
311 
312  Vec2(__m128d ivec) : vec(ivec) {}
313 
314  Vec2() {
315  vec = _mm_setzero_pd();
316  }
317 
318 
319  inline Vec2(Vec2<float> ivec);
320 
321 
322  Vec2(double f1, double f2) {
323  arr[0] = f1; arr[1] = f2;
324  }
325 
326  explicit Vec2(double f1) {
327  set1(f1);
328  }
329 
330  inline Vec2(Vec4<double> v4);
331 
332  // for masking
333  void setMask(unsigned long long m1, unsigned long long m2) {
334  Mask2<double> mask(m1,m2); vec=mask.vec;
335  }
336 
337 
338  void set(double f1, double f2) {
339  arr[0] = f1; arr[1] = f2;
340  }
341 
342  void set1(double f1) {
343  vec = _mm_set1_pd(f1);
344  }
345 
346  Vec2 get1(unsigned int n) const {
347  return Vec2(arr[n],arr[n]);
348  }
349 
350  double operator[](unsigned int n) const {
351  return arr[n];
352  }
353  };
354 
355 
356  template<>
357  union Vec4<double> {
358  __m128d vec[2];
359  double __attribute__ ((aligned(16))) arr[4];
360  OldVec<double> o;
361 
362  Vec4(__m128d ivec[]) {
363  vec[0] = ivec[0];
364  vec[1] = ivec[1];
365  }
366 
367  Vec4(__m128d ivec0, __m128d ivec1) {
368  vec[0] = ivec0;
369  vec[1] = ivec1;
370  }
371 
372  Vec4() {
373  vec[0] = _mm_setzero_pd();
374  vec[1] = _mm_setzero_pd();
375  }
376 
377  Vec4( Vec4<float> ivec) {
378  vec[0] = _mm_cvtps_pd(ivec.vec);
379  vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
380  }
381 
382  explicit Vec4(double f1) {
383  set1(f1);
384  }
385 
386  Vec4(double f1, double f2, double f3, double f4=0) {
387  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
388  }
389 
390  Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
391  vec[0] = ivec0.vec;
392  vec[1] = ivec1.vec;
393  }
394 
395  Vec4( Vec2<double> ivec0, double f3, double f4=0) {
396  vec[0] = ivec0.vec;
397  arr[2] = f3; arr[3] = f4;
398  }
399 
400  Vec4( Vec2<double> ivec0) {
401  vec[0] = ivec0.vec;
402  vec[1] = _mm_setzero_pd();
403  }
404 
405 
406  Vec4(OldVec<double> const & ivec) : o(ivec) {}
407 
408  // for masking
409  void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
410  Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1];
411  }
412 
413 
414  void set(double f1, double f2, double f3, double f4=0) {
415  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
416  }
417 
418  void set1(double f1) {
419  vec[0] = vec[1]= _mm_set1_pd(f1);
420  }
421 
422 
423  Vec4 get1(unsigned int n) const {
424  return Vec4(arr[n],arr[n],arr[n],arr[n]);
425  }
426 
427  double & operator[](unsigned int n) {
428  return arr[n];
429  }
430 
431  double operator[](unsigned int n) const {
432  return arr[n];
433  }
434 
435  Vec2<double> xy() const { return vec[0];}
436  Vec2<double> zw() const { return vec[1];}
437 
438  };
439 
440 
441  inline Vec4<float>::Vec4(Vec4<double> ivec) {
442  vec = _mm_cvtpd_ps(ivec.vec[0]);
443  __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
444  vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
445  }
446 
447 
448 #endif // CMS_USE_SSE
449 
456 
457  template<typename T>
458  struct As3D {
459  Vec4<T> const & v;
460  As3D(Vec4<T> const &iv ) : v(iv){}
461  };
462 
463  template<typename T>
464  inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
465 
466 }
467 
468 #ifdef CMS_USE_SSE
469 
470 
471 //float op
472 
473 inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
474  using mathSSE::_mm_dot_ps;
475  float s;
476  _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
477  return s;
478 }
479 
481  using mathSSE::_mm_cross_ps;
482  return _mm_cross_ps(a.vec,b.vec);
483 }
484 
485 
486 inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
487  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
488 }
489 
490 inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
491  return _mm_cmpeq_ps(a.vec,b.vec);
492 }
493 
494 inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
495  return _mm_cmpgt_ps(a.vec,b.vec);
496 }
497 
498 #ifdef __SSE3__
499 inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
500  return _mm_hadd_ps(a.vec,b.vec);
501 }
502 #endif
503 
504 
506  const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
507  return _mm_xor_ps(a.vec,neg);
508 }
509 
511  return _mm_and_ps(a.vec,b.vec);
512 }
514  return _mm_or_ps(a.vec,b.vec);
515 }
517  return _mm_xor_ps(a.vec,b.vec);
518 }
519 inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
520  return _mm_andnot_ps(a.vec,b.vec);
521 }
522 
523 
525  return _mm_add_ps(a.vec,b.vec);
526 }
527 
529  return _mm_sub_ps(a.vec,b.vec);
530 }
531 
533  return _mm_mul_ps(a.vec,b.vec);
534 }
535 
537  return _mm_div_ps(a.vec,b.vec);
538 }
539 
540 inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
541  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
542 }
543 
544 inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
545  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
546 }
547 
548 //
549 // float op 2d (use the 4d one...)
550 //
552  return -mathSSE::Vec4F(a);
553 }
554 
555 
557  return mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
558 }
559 
561  return mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
562 }
563 
565  return mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
566 }
567 
569  return mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
570 }
571 
572 
573 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
574  return s*mathSSE::Vec4F(a);
575 }
576 
577 inline mathSSE::Vec4F operator*(float s,mathSSE::Vec2F a) {
578  return s*mathSSE::Vec4F(a);
579 }
580 
581 
582 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
583 
584 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
585  return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
586 }
587 
588 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
589 
590 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
591  return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
592 }
593 
594 
596 // double op 2d
597 //
598 inline mathSSE::Vec2D::Vec2(Vec4D v4) {
599  vec = v4.vec[0];
600 }
601 
602 inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
603  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
604 }
605 
607  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
608  return _mm_xor_pd(a.vec,neg);
609 }
610 
611 
613  return _mm_and_pd(a.vec,b.vec);
614 }
616  return _mm_or_pd(a.vec,b.vec);
617 }
619  return _mm_xor_pd(a.vec,b.vec);
620 }
621 inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
622  return _mm_andnot_pd(a.vec,b.vec);
623 }
624 
625 
627  return _mm_add_pd(a.vec,b.vec);
628 }
629 
631  return _mm_sub_pd(a.vec,b.vec);
632 }
633 
635  return _mm_mul_pd(a.vec,b.vec);
636 }
637 
639  return _mm_div_pd(a.vec,b.vec);
640 }
641 
642 inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
643  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
644 }
645 
646 inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
647  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
648 }
649 
650 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));
651 
652 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
653  __m128d res = _mm_mul_pd ( a.vec, b.vec);
654  res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
655  double s;
656  _mm_store_sd(&s,res);
657  return s;
658 }
659 
660 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));
661 
662 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
663  __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
664  res = _mm_mul_pd ( a.vec , res );
665  res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
666  double s;
667  _mm_store_sd(&s,res);
668  return s;
669 }
670 
671 
672 // double op 3d
673 
674 inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
675  return
676  _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
677  _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
678 }
679 
681  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
682  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
683 }
684 
685 
687  return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
688 }
689 
690 
692  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
693 }
695  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
696 }
698  return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
699 }
700 
701 inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
702  __m128d res = _mm_set1_pd(a);
703  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
704 }
705 
706 inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
707  __m128d res = _mm_set1_pd(a);
708  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
709 }
710 
711 // mix algebra (creates ambiguities...)
712 /*
713 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
714  return a + mathSSE::Vec4D(b);
715 }
716 inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
717  return a + mathSSE::Vec4D(b);
718 }
719 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
720  return a - mathSSE::Vec4D(b);
721 }
722 inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
723  return a - mathSSE::Vec4D(b);
724 }
725 */
726 
727 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
728 
729 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
730  __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
731  _mm_mul_sd ( a.vec[1], b.vec[1])
732  );
733  res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
734  double s;
735  _mm_store_sd(&s,res);
736  return s;
737 }
738 
739 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
740 
741 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
742  const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
743  // lh .z * rh .x , lh .z * rh .y
744  __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
745  // rh .z * lh .x , rh .z * lh .y
746  __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
747  __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
748  m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
749  m1 = _mm_xor_pd ( m1 , neg ); // change the sign of the first element
750  // lh .x * rh .y , lh .y * rh .x
751  l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
752  // lh .x * rh .y - lh .y * rh .x
753  __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
754 
755  return mathSSE::Vec4D( m1 , m2 );
756 }
757 
758 
759 
760 // sqrt
761 namespace mathSSE {
762  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
763  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
764  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
765  template<> inline Vec4D sqrt(Vec4D v) {
766  return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
767  }
768 }
769 
770 // chephes func
772 namespace mathSSE {
773  inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
774  inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
775  inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
776  inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
777  inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
778 
779  inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
780  inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
781  inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
782  inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
783  inline void sincos(float f, float & s, float & c) {
784  __m128 vs, vc;
785  sincos_ps(_mm_load_ss(&f),&vs, &vc);
786  _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);
787  }
788 }
789 #endif // CMS_USE_SSE
790 
791 
792 #include <iosfwd>
793 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
794 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
795 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
796 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
797 
798 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
799 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
800 
801 
802 #endif // DataFormat_Math_SSEVec_H
T & operator[](unsigned int n)
Definition: SSEVec.h:138
int i
Definition: DBlmapReader.cc:9
void set1(float f1)
Definition: SSEVec.h:164
Vec4< double > Vec4D
Definition: SSEVec.h:455
Vec4< float > Vec3F
Definition: SSEVec.h:452
Vec4< float > Vec4F
Definition: SSEVec.h:451
Vec2 get1(unsigned int n) const
Definition: SSEVec.h:126
MatrixMeschach operator+(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
Sin< T >::type sin(const T &t)
Definition: Sin.h:22
T arr[2]
Definition: SSEVec.h:147
MatrixMeschach operator-(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
void set(T f1, T f2)
Definition: SSEVec.h:123
strbitset operator|(const strbitset &l, const strbitset &r)
Definition: strbitset.cc:15
bool operator==(const CaloTower &t1, const CaloTower &t2)
Definition: CaloTower.h:211
Exp< T >::type exp(const T &t)
Definition: Exp.h:22
bool int lh
Definition: SSEVec.h:37
OldVec< T > o
Definition: SSEVec.h:177
std::ostream & operator<<(std::ostream &out, const ALILine &li)
Definition: ALILine.cc:187
Basic3DVector< long double > operator/(const Basic3DVector< long double > &v, S s)
Vec4 get1(unsigned int n) const
Definition: SSEVec.h:167
T operator[](unsigned int n) const
Definition: SSEVec.h:142
Vec4(float f1, float f2, float f3, float f4=0)
Definition: SSEVec.h:155
As3D< T > as3D(Vec4< T > const &v)
Definition: SSEVec.h:464
Vec2< T > zw() const
Definition: SSEVec.h:172
T sqrt(T t)
Definition: SSEVec.h:28
Cos< T >::type cos(const T &t)
Definition: Cos.h:22
Vec4< double > Vec3D
Definition: SSEVec.h:454
struct mathSSE::Rot3 __attribute__
return samesign< long long >(a.i, b.i)
tuple out
Definition: dbtoconf.py:99
Vec4(float f1)
Definition: SSEVec.h:158
return samesign< int >(a.i, b.i)
Vec2< float > Vec2F
Definition: SSEVec.h:450
Log< T >::type log(const T &t)
Definition: Log.h:22
double b
Definition: hdecay.h:120
As3D(Vec4< T > const &iv)
Definition: SSEVec.h:460
Vec2(T f1, T f2)
Definition: SSEVec.h:116
a f
Definition: SSEVec.h:53
Vec2(Vec2< U > v)
Definition: SSEVec.h:131
bool samesign(T rh, T lh)
T dot(const Basic3DVector &v) const
Scalar product, or &quot;dot&quot; product, with a vector of same type.
double a
Definition: hdecay.h:121
MatrixMeschach operator*(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
TwoHolder< T, U > operator&(const T &iT, const U &iU)
string s
Definition: asciidump.py:422
std::auto_ptr< ParameterDescriptionNode > operator^(ParameterDescriptionNode const &node_left, ParameterDescriptionNode const &node_right)
Vec4< T > const & v
Definition: SSEVec.h:459
long double T
mathSSE::Vec4< T > v
def template
Definition: svgfig.py:520
T __attribute__((aligned(16))) arr[4]
Vec2< double > Vec2D
Definition: SSEVec.h:453
void set(float f1, float f2, float f3, float f4=0)
Definition: SSEVec.h:161
Vec2< T > xy() const
Definition: SSEVec.h:171
Basic3DVector cross(const Basic3DVector &v) const
Vector product, or &quot;cross&quot; product, with a vector of same type.
Vec2(T f1)
Definition: SSEVec.h:119