
SSEVec.h
#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && !defined(__PPC64__) && !defined(__powerpc__)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX__
#define CMS_USE_AVX
#endif /* __AVX__ */
#endif /* defined(__GNUC__) */
#endif /* x86 only: not ARM, MIC, or PowerPC */

#include <cmath>


namespace mathSSE {
  template<typename T> inline T sqrt(T t) { return std::sqrt(t); }
}

namespace mathSSE {
#ifdef CMS_USE_SSE
  //dot
  inline __m128
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul,mul);
    return _mm_hadd_ps(mul,mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
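  // In all three branches the full 4-component dot product ends up replicated
  // in every lane of the result:
  //  - SSE4.1: _mm_dp_ps with mask 0xff multiplies all four lanes and
  //    broadcasts the sum to all four lanes;
  //  - SSE3:   two horizontal adds reduce the four products to the total sum;
  //  - SSE2:   the shuffle/add pairs first sum the high and low halves,
  //    then the adjacent pairs.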


  // cross (just 3x3)
  inline __m128
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm_cross_ps(__m128 v1, __m128 v2) {
    // same order is _MM_SHUFFLE(3,2,1,0)
    // x2, z1,z1
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // y1, x2,y2
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // x1, z2,z2
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // y2, x1,y1
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    const __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
    return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
  }
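  // The two shuffled products differ by (y1*z2-y2*z1, x1*z2-x2*z1, x1*y2-x2*y1, 0);
  // xor-ing lane 1 with -0.0 flips its sign, giving the conventional
  // right-handed cross product in the first three lanes (the w lane stays 0).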
#endif // CMS_USE_SSE

#ifdef CMS_USE_AVX
  inline __m256d
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul,mul);
    __m256d tmp = _mm256_permute2f128_pd(mul,mul,1);
    return _mm256_add_pd(mul,tmp);
  }
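  // _mm256_hadd_pd adds pairs only within each 128-bit half, so the
  // permute2f128 swaps the two halves and the final add leaves the full
  // 4-component sum in every lane.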

  inline __m256d
  __attribute__((always_inline)) __attribute__ ((pure))
  _mm256_cross_pd(__m256d v1, __m256d v2) {

    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2<<4)+1);
    v3 = _mm256_permute_pd(v3,0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2<<4));
    v4 = _mm256_permute_pd(v4,5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2<<4)+1);
    v3 = _mm256_permute_pd(v3,0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2<<4));
    v4 = _mm256_permute_pd(v4,5);

    v3 = _mm256_mul_pd(v3, v4);
    const __m256d neg = _mm256_set_pd(0.0,0.0,-0.0,0.0);
    return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);

  }

#endif // CMS_USE_AVX



  template<typename T>
  struct OldVec { T theX; T theY; T theZ; T theW;} __attribute__ ((aligned (16)));
#ifdef CMS_USE_AVX
  template<>
  struct OldVec<double> { double theX; double theY; double theZ; double theW;} __attribute__ ((aligned (32)));
#endif

  template<typename T> union Vec4;

  template<typename T> union Vec2{
    Vec2() {
      arr[0] = 0; arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1; arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1; arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1; arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N],arr[N]);
    }

    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */

    template<typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0]; arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T & operator[](unsigned int n) {
      return arr[n];
    }

    T operator[](unsigned int n) const {
      return arr[n];
    }


    T arr[2];
  };


  template<typename T> union Vec4{
    Vec4() {
      arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3] = 0;
    }
    Vec4(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }
    explicit Vec4(T f1) {
      set1(f1);
    }
    void set(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }
    void set1(T f1) {
      arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N],arr[N],arr[N],arr[N]);
    }
    /*
    Vec4 get1(unsigned int n) const {
      return Vec4(arr[n],arr[n],arr[n],arr[n]);
    }
    */

    Vec2<T> xy() const { return Vec2<T>(arr[0],arr[1]);}
    Vec2<T> zw() const { return Vec2<T>(arr[2],arr[3]);}



    T __attribute__ ((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template<typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0]; arr[1] = v4[1];
  }


#ifdef CMS_USE_SSE

  template<>
  union Vec2<double>;
  template<>
  union Vec4<double>;

  template<typename T> union Mask2{};
  template<typename T> union Mask4{};

  template<>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__ ((aligned(16))) mask[4];
    Mask4() {vec = _mm_setzero_ps();}
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
    }
  };

#ifdef CMS_USE_AVX
  template<>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__ ((aligned(32))) mask[4];
    Mask4() {
      vec = _mm256_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
    }
  };
#else
  template<>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__ ((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
    }
  };
#endif

  template<>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__ ((aligned(16))) mask[2];
    Mask2() {
      vec = _mm_setzero_pd();
    }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0]=m1; mask[1]=m2;
    }
  };
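  // The Mask unions overlay an integer array on the SIMD register, so that
  // setMask() below can build arbitrary per-lane bit patterns (e.g. all bits
  // set in a lane) and copy them into a vector.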

  template<>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__ ((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const & ivec) : o(ivec) {}

    Vec4() {
      vec = _mm_setzero_ps();
    }


    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) {
      set1(f1);
    }

    Vec4(float f1, float f2, float f3, float f4=0) {
      vec = _mm_set_ps(f4, f3, f2, f1);
    }


    Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0]; arr[3] = ivec1.arr[1];
    }

    Vec4( Vec2<float> ivec0, float f3, float f4=0) {
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
      arr[2] = f3; arr[3] = f4;
    }

    Vec4( Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
    }


    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1,m2,m3,m4); vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4=0) {
      vec = _mm_set_ps(f4, f3, f2, f1);
    }

    void set1(float f1) {
      vec = _mm_set1_ps(f1);
    }

    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float & operator[](unsigned int n) {
      return arr[n];
    }

    float operator[](unsigned int n) const {
      return arr[n];
    }

    Vec2<float> xy() const { return Vec2<float>(arr[0],arr[1]);}
    Vec2<float> zw() const { return Vec2<float>(arr[2],arr[3]);}

  };


  template<>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__ ((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() {
      vec = _mm_setzero_pd();
    }


    inline Vec2(Vec2<float> ivec);


    Vec2(double f1, double f2) {
      vec = _mm_set_pd(f2,f1);
    }

    explicit Vec2(double f1) {
      set1(f1);
    }

    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1,m2); vec = mask.vec;
    }


    void set(double f1, double f2) {
      vec = _mm_set_pd(f2,f1);
    }

    void set1(double f1) {
      vec = _mm_set1_pd(f1);
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N],arr[N]);
    }
    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */
    double & operator[](unsigned int n) {
      return arr[n];
    }
    double operator[](unsigned int n) const {
      return arr[n];
    }
  };




#ifdef CMS_USE_AVX
}// namespace mathSSE
#include "AVXVec.h"

namespace mathSSE {
#else
  template<>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__ ((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4( Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) {
      set1(f1);
    }

    Vec4(double f1, double f2, double f3, double f4=0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4( Vec2<double> ivec0, double f3, double f4=0) {
      vec[0] = ivec0.vec;
      arr[2] = f3; arr[3] = f4;
    }

    Vec4( Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }


    Vec4(OldVec<double> const & ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1,m2,m3,m4); vec[0] = mask.vec[0]; vec[1] = mask.vec[1];
    }


    void set(double f1, double f2, double f3, double f4=0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    void set1(double f1) {
      vec[0] = vec[1] = _mm_set1_pd(f1);
    }


    template<int N>
    Vec4 get1() const {
      return Vec4(arr[N],arr[N],arr[N],arr[N]);
    }

    double & operator[](unsigned int n) {
      return arr[n];
    }

    double operator[](unsigned int n) const {
      return arr[n];
    }

    Vec2<double> xy() const { return vec[0];}
    Vec2<double> zw() const { return vec[1];}

  };
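  // Without AVX a Vec4<double> is kept in two __m128d registers:
  // vec[0] holds (x,y) and vec[1] holds (z,w), which is why the double
  // operators further down act on the two halves separately.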


  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif // CMS_USE_AVX


#endif // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  template<typename T>
  struct As3D {
    Vec4<T> const & v;
    As3D(Vec4<T> const &iv ) : v(iv){}
  };

  template<typename T>
  inline As3D<T> as3D(Vec4<T> const &v ) { return v;}

} // namespace mathSSE

#ifdef CMS_USE_SSE


//float op


inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpeq_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_cmpgt_ps(a.vec,b.vec);
}

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_hadd_ps(a.vec,b.vec);
}
#endif


inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
  return _mm_xor_ps(a.vec,neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_and_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_or_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_xor_ps(a.vec,b.vec);
}
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_andnot_ps(a.vec,b.vec);
}


inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_add_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_sub_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_mul_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_div_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_min_ps(a.vec,b.vec);
}

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_max_ps(a.vec,b.vec);
}


inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) {
  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) {
  return _mm_div_ps(b.vec,_mm_set1_ps(a));
}


inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec,b.vec);
}


inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a*b;
#ifdef __SSE3__
  mul = hadd(mul,mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s,mul.vec);
  return s;
}
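// dotxy reduces only the first two lanes of the product: a.x*b.x + a.y*b.y.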


//
// float op 2d (use the 4d one...)
//
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
  return -mathSSE::Vec4F(a);
}


inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
}

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return min(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
}

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return max(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
}


inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
  return s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) {
  return s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) {
  return mathSSE::Vec4F(a)/s;
}


inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
  return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
}

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
}


//
// double op 2d
//

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
  return _mm_xor_pd(a.vec,neg);
}


inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_and_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_or_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_xor_pd(a.vec,b.vec);
}
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_andnot_pd(a.vec,b.vec);
}

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_hadd_pd(a.vec,b.vec);
}
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_add_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_sub_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_mul_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_div_pd(a.vec,b.vec);
}


inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_min_pd(a.vec,b.vec);
}

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  return _mm_max_pd(a.vec,b.vec);
}


inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) {
  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) {
  return _mm_div_pd(b.vec,_mm_set1_pd(a));
}


inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
  __m128d res = _mm_mul_pd ( a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res,res);
#else
  res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
#endif
  double s;
  _mm_store_sd(&s,res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
  res = _mm_mul_pd ( a.vec , res );
  res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
  double s;
  _mm_store_sd(&s,res);
  return s;
}


#ifndef CMS_USE_AVX
// double op 3d


#ifdef __SSE3__
// consistent with AVX...
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_hadd_pd(a.vec[0],b.vec[0]),_mm_hadd_pd(a.vec[1],b.vec[1]));
}
#endif


inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
}


inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
}


inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
}


inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(_mm_min_pd(a.vec[0],b.vec[0]),_mm_min_pd(a.vec[1],b.vec[1]));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(_mm_max_pd(a.vec[0],b.vec[0]),_mm_max_pd(a.vec[1],b.vec[1]));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1./a;
  return a*b;
}

// mix algebra (creates ambiguities...)
/*
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a - mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a - mathSSE::Vec4D(b);
}
*/

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
                             _mm_mul_sd ( a.vec[1], b.vec[1])
                             );
  res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
  double s;
  _mm_store_sd(&s,res);
  return s;
}

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
  // lh .z * rh .x , lh .z * rh .y
  __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
  // rh .z * lh .x , rh .z * lh .y
  __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
  __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
  m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
  m1 = _mm_xor_pd ( m1 , neg ); // change the sign of the first element
  // lh .x * rh .y , lh .y * rh .x
  l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
  // lh .x * rh .y - lh .y * rh .x
  __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
  return mathSSE::Vec4D(m1, m2);
}


inline double
__attribute__((always_inline)) __attribute__ ((pure))
dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(),b.xy());
}

#endif // CMS_USE_AVX


// sqrt
namespace mathSSE {
  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
#ifdef CMS_USE_AVX
  template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec);}
#else
  template<> inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
  }
#endif
}

#endif // CMS_USE_SSE


#include <iosfwd>
std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);

std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);


#endif // DataFormat_Math_SSEVec_H
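
A minimal usage sketch (illustrative, not part of the header): it assumes an x86 build where CMS_USE_SSE is defined, and that the header is reachable at the usual CMSSW path, which may differ in a local checkout.

// example.cc -- hypothetical client code
#include <iostream>
#include "DataFormats/Math/interface/SSEVec.h"  // include path assumed

int main() {
  mathSSE::Vec4F a(1.f, 2.f, 3.f);   // w defaults to 0
  mathSSE::Vec4F b(4.f, 5.f, 6.f);

  mathSSE::Vec4F sum = a + b;        // lane-wise add
  float d   = dot(a, b);             // full dot product (w lanes are 0 here)
  float dxy = dotxy(a, b);           // x,y lanes only
  mathSSE::Vec4F c = cross(a, b);    // 3D cross product

  std::cout << sum[0] << ' ' << d << ' ' << dxy << ' '
            << c[0] << ' ' << c[1] << ' ' << c[2] << '\n';
  return 0;
}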