// SSEVec.h — CMSSW DataFormats/Math SSE/AVX vector utilities
// (source listing recovered from a Doxygen documentation page)
1 #ifndef DataFormat_Math_SSEVec_H
2 #define DataFormat_Math_SSEVec_H
3 
4 #if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
5 #if defined(__GNUC__)
6 #include <x86intrin.h>
7 #define CMS_USE_SSE
8 #ifdef __AVX__
9 #define CMS_USE_AVX
10 #endif /* __AVX__ */
11 #endif /* defined(__GNUC__) */
12 #endif /* !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) */
13 
14 #include<cmath>
15 
16 
namespace mathSSE {
  /// Generic scalar square root; forwards to std::sqrt so the vector
  /// specializations further down can share the same name.
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
}  // namespace mathSSE
20 
21 
22 namespace mathSSE {
23 #ifdef CMS_USE_SSE
24  //dot
25  inline __m128
26  __attribute__((always_inline)) __attribute__ ((pure))
27  _mm_dot_ps(__m128 v1, __m128 v2) {
28 #ifdef __SSE4_1__
29  return _mm_dp_ps(v1, v2, 0xff);
30 #else
31  __m128 mul = _mm_mul_ps(v1, v2);
32 #ifdef __SSE3__
33  mul = _mm_hadd_ps(mul,mul);
34  return _mm_hadd_ps(mul,mul);
35 #else
36  __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
37  mul = _mm_add_ps(mul, swp);
38  swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
39  return _mm_add_ps(mul, swp);
40 #endif
41 #endif
42  }
43 
44 
45  // cross (just 3x3)
46  inline __m128
47  __attribute__((always_inline)) __attribute__ ((pure))
48  _mm_cross_ps(__m128 v1, __m128 v2) {
49  // same order is _MM_SHUFFLE(3,2,1,0)
50  // x2, z1,z1
51  __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
52  // y1, x2,y2
53  __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
54 
55  __m128 v5 = _mm_mul_ps(v3, v4);
56 
57  // x1, z2,z2
58  v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
59  // y2, x1,y1
60  v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
61 
62  v3 = _mm_mul_ps(v3, v4);
63  const __m128i neg = _mm_set_epi32(0,0,0x80000000,0);
64  __m128i ret = __m128i(_mm_sub_ps(v5, v3));
65  return __m128(_mm_xor_si128(ret,neg));
66  }
67 #endif // CMS_USE_SSE
68 
69 #ifdef CMS_USE_AVX
 // Full 4-lane double dot product: lane-wise multiply, in-lane horizontal
 // add, then add the swapped 128-bit halves so every lane holds the sum.
 inline __m256d
 __attribute__((always_inline)) __attribute__ ((pure))
 _mm256_dot_pd(__m256d v1, __m256d v2) {
 __m256d mul = _mm256_mul_pd(v1, v2);
 mul = _mm256_hadd_pd(mul,mul);
 // swap the low and high 128-bit halves
 __m256d tmp = _mm256_permute2f128_pd(mul,mul,1);
 return _mm256_add_pd(mul,tmp);
 }

 // 3D cross product of the first three lanes (AVX double version); mirrors
 // the SSE float version above: two shuffled products, subtract, and flip
 // the sign bit of one lane with an integer xor.
 inline __m256d
 __attribute__((always_inline)) __attribute__ ((pure))
 _mm256_cross_pd(__m256d v1, __m256d v2) {

 __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2<<4)+1);
 v3 = _mm256_permute_pd(v3,0);

 __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2<<4));
 v4 = _mm256_permute_pd(v4,5);

 __m256d v5 = _mm256_mul_pd(v3, v4);

 v3 = _mm256_permute2f128_pd(v1, v2, (2<<4)+1);
 v3 = _mm256_permute_pd(v3,0);

 v4 = _mm256_permute2f128_pd(v2, v1, (2<<4));
 v4 = _mm256_permute_pd(v4,5);

 v3 = _mm256_mul_pd(v3, v4);
 __m256d ret = _mm256_sub_pd(v5, v3);
 // 0x8000000000000000 is the IEEE-754 double sign mask (lane 1 only)
 const __m256i neg = _mm256_set_epi64x(0,0,0x8000000000000000,0);
 return __m256d(_mm256_xor_si256(__m256i(ret), neg));
 }
102 
103 #endif // CMS_USE_AVX
104 
105 
106 
 // Legacy POD (x,y,z,w) layout, 16-byte aligned so it can alias the SIMD
 // register member inside the Vec4 unions below.
 template<typename T>
 struct OldVec { T theX; T theY; T theZ; T theW;} __attribute__ ((aligned (16)));
#ifdef CMS_USE_AVX
 // With AVX a double 4-vector occupies a full 32-byte register.
 template<>
 struct OldVec<double> { double theX; double theY; double theZ; double theW;} __attribute__ ((aligned (32)));
#endif
113 
114  template<typename T> union Vec4;
115 
116  template<typename T> union Vec2{
117  Vec2() {
118  arr[0] = 0; arr[1] = 0;
119  }
120  Vec2(T f1, T f2) {
121  arr[0] = f1; arr[1] = f2;
122  }
123  explicit Vec2(T f1) {
124  arr[0] = f1; arr[1] = f1;
125  }
126 
127  void set(T f1, T f2) {
128  arr[0] = f1; arr[1] = f2;
129  }
130 
131  template <int N>
132  Vec2 get1() const {
133  return Vec2(arr[N],arr[N]);
134  }
135 
136  /*
137  Vec2 get1(unsigned int n) const {
138  return Vec2(arr[n],arr[n]);
139  }
140  */
141 
142  template<typename U>
144  arr[0] = v[0]; arr[1] = v[1];
145 
146  }
147 
148  inline Vec2(Vec4<T> v4);
149 
150  T & operator[](unsigned int n) {
151  return arr[n];
152  }
153 
154  T operator[](unsigned int n) const {
155  return arr[n];
156  }
157 
158 
159  T arr[2];
160  };
161 
162 
163  template<typename T> union Vec4{
164  Vec4() {
165  arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
166  }
167  Vec4(float f1, float f2, float f3, float f4=0) {
168  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
169  }
170  explicit Vec4(float f1) {
171  set1(f1);
172  }
173  void set(float f1, float f2, float f3, float f4=0) {
174  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
175  }
176  void set1(float f1) {
177  arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
178  }
179  template <int N>
180  Vec4 get1() const {
181  return Vec4(arr[N],arr[N],arr[N],arr[N]);
182  }
183  /*
184  Vec4 get1(unsigned int n) const {
185  return Vec4(arr[n],arr[n],arr[n],arr[n]);
186  }
187  */
188 
189  Vec2<T> xy() const { return Vec2<T>(arr[0],arr[1]);}
190  Vec2<T> zw() const { return Vec2<T>(arr[2],arr[3]);}
191 
192 
193 
194  T __attribute__ ((aligned(16))) arr[4];
196  };
197 
198  template<typename T>
199  inline Vec2<T>::Vec2(Vec4<T> v4) {
200  arr[0]=v4[0];arr[1]=v4[1];
201  }
202 
203 
204 #ifdef CMS_USE_SSE
205 
206  template<>
207  union Vec2<double>;
208  template<>
209  union Vec4<double>;
210 
211  template<typename T> union Mask2{};
212  template<typename T> union Mask4{};
213 
214  template<>
215  union Mask4<float> {
216  __m128 vec;
217  unsigned int __attribute__ ((aligned(16))) mask[4];
218  Mask4() {vec = _mm_setzero_ps();}
219  Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
220  mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
221  }
222  };
223 
#ifdef CMS_USE_AVX
 // Four 64-bit per-lane masks on one 256-bit AVX register.
 template<>
 union Mask4<double> {
 __m256d vec;
 unsigned long long __attribute__ ((aligned(32))) mask[4];
 Mask4() {
 vec = _mm256_setzero_pd();
 }
 Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
 mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
 }
 };
#else
 // Four 64-bit per-lane masks split across two SSE double registers.
 template<>
 union Mask4<double> {
 __m128d vec[2];
 unsigned long long __attribute__ ((aligned(16))) mask[4];
 Mask4() {
 vec[0] = _mm_setzero_pd();
 vec[1] = _mm_setzero_pd();
 }
 Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
 mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
 }
 };
#endif

 // Two 64-bit per-lane masks on one SSE double register.
 template<>
 union Mask2<double> {
 __m128d vec;
 unsigned long long __attribute__ ((aligned(16))) mask[2];
 Mask2() {
 vec = _mm_setzero_pd();
 }
 Mask2(unsigned long long m1, unsigned long long m2) {
 mask[0]=m1; mask[1]=m2;
 }
 };
262 
263  template<>
264  union Vec4<float> {
265  typedef __m128 nativeType;
266  __m128 vec;
267  float __attribute__ ((aligned(16))) arr[4];
269 
270  Vec4(__m128 ivec) : vec(ivec) {}
271 
272  Vec4(OldVec<float> const & ivec) : o(ivec) {}
273 
274  Vec4() {
275  vec = _mm_setzero_ps();
276  }
277 
278 
279  inline Vec4(Vec4<double> ivec);
280 
281  explicit Vec4(float f1) {
282  set1(f1);
283  }
284 
285  Vec4(float f1, float f2, float f3, float f4=0) {
286  vec = _mm_set_ps(f4, f3, f2, f1);
287  }
288 
289 
290  Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
291  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
292  arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
293  }
294 
295  Vec4( Vec2<float> ivec0, float f3, float f4=0) {
296  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
297  arr[2] = f3; arr[3] = f4;
298  }
299 
300  Vec4( Vec2<float> ivec0) {
301  vec = _mm_setzero_ps();
302  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
303  }
304 
305 
306  // for masking
307  void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
308  Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec;
309  }
310 
311  void set(float f1, float f2, float f3, float f4=0) {
312  vec = _mm_set_ps(f4, f3, f2, f1);
313  }
314 
315  void set1(float f1) {
316  vec = _mm_set1_ps(f1);
317  }
318 
319  template <int N>
320  Vec4 get1() const {
321  return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
322  }
323 
324  float & operator[](unsigned int n) {
325  return arr[n];
326  }
327 
328  float operator[](unsigned int n) const {
329  return arr[n];
330  }
331 
332  Vec2<float> xy() const { return Vec2<float>(arr[0],arr[1]);}
333  Vec2<float> zw() const { return Vec2<float>(arr[2],arr[3]);}
334 
335  };
336 
337 
 // SSE specialization: two doubles overlaid on one __m128d register.
 template<>
 union Vec2<double> {
 typedef  __m128d nativeType;
 __m128d vec;
 double __attribute__ ((aligned(16))) arr[2];

 Vec2(__m128d ivec) : vec(ivec) {}

 // zero-initialized
 Vec2() {
 vec = _mm_setzero_pd();
 }

 // widening conversion from the float 2-vector (defined later in the file)
 inline Vec2(Vec2<float> ivec);


 Vec2(double f1, double f2) {
 // note: _mm_set_pd takes its arguments in reverse lane order
 vec = _mm_set_pd(f2,f1);
 }

 // broadcast f1 into both lanes
 explicit Vec2(double f1) {
 set1(f1);
 }

 // keep lanes 0,1 of a 4-vector (defined later; layout depends on AVX/SSE)
 inline Vec2(Vec4<double> v4);

 // for masking
 void setMask(unsigned long long m1, unsigned long long m2) {
 Mask2<double> mask(m1,m2); vec=mask.vec;
 }


 void set(double f1, double f2) {
 vec = _mm_set_pd(f2,f1);
 }

 void set1(double f1) {
 vec = _mm_set1_pd(f1);
 }

 // broadcast lane N into both lanes
 template <int N>
 Vec2 get1() const {
 return Vec2(arr[N],arr[N]);
 }
 /*
 Vec2 get1(unsigned int n) const {
 return Vec2(arr[n],arr[n]);
 }
 */
 double & operator[](unsigned int n) {
 return arr[n];
 }
 double operator[](unsigned int n) const {
 return arr[n];
 }
 };
394 
395 
396 
397 
398 #ifdef CMS_USE_AVX
399 }// namespace mathSSE
400 #include "AVXVec.h"
401 
402 namespace mathSSE {
403 #else
404  template<>
405  union Vec4<double> {
406  __m128d vec[2];
407  double __attribute__ ((aligned(16))) arr[4];
409 
410  Vec4(__m128d ivec[]) {
411  vec[0] = ivec[0];
412  vec[1] = ivec[1];
413  }
414 
415  Vec4(__m128d ivec0, __m128d ivec1) {
416  vec[0] = ivec0;
417  vec[1] = ivec1;
418  }
419 
420  Vec4() {
421  vec[0] = _mm_setzero_pd();
422  vec[1] = _mm_setzero_pd();
423  }
424 
425  Vec4( Vec4<float> ivec) {
426  vec[0] = _mm_cvtps_pd(ivec.vec);
427  vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
428  }
429 
430  explicit Vec4(double f1) {
431  set1(f1);
432  }
433 
434  Vec4(double f1, double f2, double f3, double f4=0) {
435  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
436  }
437 
438  Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
439  vec[0] = ivec0.vec;
440  vec[1] = ivec1.vec;
441  }
442 
443  Vec4( Vec2<double> ivec0, double f3, double f4=0) {
444  vec[0] = ivec0.vec;
445  arr[2] = f3; arr[3] = f4;
446  }
447 
448  Vec4( Vec2<double> ivec0) {
449  vec[0] = ivec0.vec;
450  vec[1] = _mm_setzero_pd();
451  }
452 
453 
454  Vec4(OldVec<double> const & ivec) : o(ivec) {}
455 
456  // for masking
457  void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
458  Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1];
459  }
460 
461 
462  void set(double f1, double f2, double f3, double f4=0) {
463  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
464  }
465 
466  void set1(double f1) {
467  vec[0] = vec[1]= _mm_set1_pd(f1);
468  }
469 
470 
471  template<int N>
472  Vec4 get1() const {
473  return Vec4(arr[N],arr[N],arr[N],arr[N]);
474  }
475 
476  double & operator[](unsigned int n) {
477  return arr[n];
478  }
479 
480  double operator[](unsigned int n) const {
481  return arr[n];
482  }
483 
484  Vec2<double> xy() const { return vec[0];}
485  Vec2<double> zw() const { return vec[1];}
486 
487  };
488 
489 
 // Narrowing conversion: pack four doubles (two __m128d) into one __m128.
 inline Vec4<float>::Vec4(Vec4<double> ivec) {
 vec = _mm_cvtpd_ps(ivec.vec[0]);
 __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
 // combine: lanes 0,1 from vec, lanes 2,3 from v2
 vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
 }
495 #endif // CMS_USE_AVX
496 
497 
498 #endif // CMS_USE_SSE
499 
506 
507  template<typename T>
508  struct As3D {
509  Vec4<T> const & v;
510  As3D(Vec4<T> const &iv ) : v(iv){}
511  };
512 
513  template<typename T>
514  inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
515 
516 } // namespace mathSSE
517 
518 #ifdef CMS_USE_SSE
519 
520 
521 //float op
522 
523 
525  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
526 }
527 
529  return _mm_cmpeq_ps(a.vec,b.vec);
530 }
531 
533  return _mm_cmpgt_ps(a.vec,b.vec);
534 }
535 
536 #ifdef __SSE3__
538  return _mm_hadd_ps(a.vec,b.vec);
539 }
540 #endif
541 
542 
544  const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
545  return _mm_xor_ps(a.vec,neg);
546 }
547 
549  return _mm_and_ps(a.vec,b.vec);
550 }
552  return _mm_or_ps(a.vec,b.vec);
553 }
555  return _mm_xor_ps(a.vec,b.vec);
556 }
558  return _mm_andnot_ps(a.vec,b.vec);
559 }
560 
561 
563  return _mm_add_ps(a.vec,b.vec);
564 }
565 
567  return _mm_sub_ps(a.vec,b.vec);
568 }
569 
571  return _mm_mul_ps(a.vec,b.vec);
572 }
573 
575  return _mm_div_ps(a.vec,b.vec);
576 }
577 
579  return _mm_min_ps(a.vec,b.vec);
580 }
581 
583  return _mm_max_ps(a.vec,b.vec);
584 }
585 
586 
// scalar * vector and vector / scalar: broadcast the scalar to all lanes
inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b,float a) {
  return _mm_div_ps(b.vec,_mm_set1_ps(a));
}


// full 4-lane dot product; _mm_dot_ps leaves the sum in lane 0 (stored here)
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
  return s;
}
606 
608  using mathSSE::_mm_cross_ps;
609  return _mm_cross_ps(a.vec,b.vec);
610 }
611 
612 
// dot product of lanes 0,1 only: after one horizontal add (SSE3) or one
// neighbour-swap + add (SSE2), lane 0 holds a.x*b.x + a.y*b.y
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a*b;
#ifdef __SSE3__
  mul = hadd(mul,mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s,mul.vec);
  return s;
}
625 
626 
627 //
628 // float op 2d (use the 4d one...)
629 //
631  return -mathSSE::Vec4F(a);
632 }
633 
634 
636  return mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
637 }
638 
640  return mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
641 }
642 
644  return mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
645 }
646 
648  return mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
649 }
650 
652  return min(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
653 }
654 
657 }
658 
659 
// scalar ops for 2D float vectors: promote to Vec4F and reuse the 4D ops
inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
  return s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) {
  return s*mathSSE::Vec4F(a);
}

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) {
  return mathSSE::Vec4F(a)/s;
}


// 2D dot product (plain scalar arithmetic)
inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
  return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
}

// 2D cross product: the scalar z-component a.x*b.y - a.y*b.x
inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
  return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
}
684 
685 
687 // double op 2d
688 //
689 
// Vec2D from Vec4D: keep lanes 0,1 (with AVX: the low 128-bit half)
inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

// widening conversion: element-wise float -> double
inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
}
701 
703  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
704  return _mm_xor_pd(a.vec,neg);
705 }
706 
707 
709  return _mm_and_pd(a.vec,b.vec);
710 }
712  return _mm_or_pd(a.vec,b.vec);
713 }
715  return _mm_xor_pd(a.vec,b.vec);
716 }
718  return _mm_andnot_pd(a.vec,b.vec);
719 }
720 
721 #ifdef __SSE3__
723  return _mm_hadd_pd(a.vec,b.vec);
724 }
725 #endif
726 
728  return _mm_add_pd(a.vec,b.vec);
729 }
730 
732  return _mm_sub_pd(a.vec,b.vec);
733 }
734 
736  return _mm_mul_pd(a.vec,b.vec);
737 }
738 
740  return _mm_div_pd(a.vec,b.vec);
741 }
742 
743 
745  return _mm_min_pd(a.vec,b.vec);
746 }
747 
749  return _mm_max_pd(a.vec,b.vec);
750 }
751 
752 
// scalar * vector and vector / scalar: broadcast the scalar to both lanes
inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
}

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b,double a) {
  return _mm_div_pd(b.vec,_mm_set1_pd(a));
}


// 2D dot product: multiply, then sum the two lanes into lane 0
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
  __m128d res = _mm_mul_pd ( a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res,res);
#else
  res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
#endif
  double s;
  _mm_store_sd(&s,res);
  return s;
}

// 2D cross product: a.x*b.y - a.y*b.x, computed via a lane swap of b
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
  res = _mm_mul_pd ( a.vec , res );
  res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
  double s;
  _mm_store_sd(&s,res);
  return s;
}
790 
791 
792 #ifndef CMS_USE_AVX
793 // double op 3d
794 
795 
#ifdef __SSE3__
// consistent with AVX...
// Reconstructed (signature and body dropped in the extracted listing):
// pairwise horizontal add on each 128-bit half.
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_hadd_pd(a.vec[0],b.vec[0]),_mm_hadd_pd(a.vec[1],b.vec[1]));
}
#endif
802 
803 
// all four lanes equal (both 128-bit halves must compare equal)
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return
  _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
  _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
}
809 
811  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
812  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
813 }
814 
815 
817  return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
818 }
819 
820 
822  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
823 }
825  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
826 }
828  return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
829 }
830 
831 
833  using namespace mathSSE;
835 }
836 
838  using namespace mathSSE;
840 }
841 
// scalar * vector: broadcast the scalar, multiply each half
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
}

// vector / scalar via multiplication by the reciprocal
inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1./a;
  return a*b;
}

// mix algebra (creates ambiguities...)
/*
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a - mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a - mathSSE::Vec4D(b);
}
*/

// full 4-lane dot product: sum lanes 0,1 of the low half and lane 0 of the
// high half's product, then fold lane 1 down into lane 0
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
  _mm_mul_sd ( a.vec[1], b.vec[1])
  );
  res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
  double s;
  _mm_store_sd(&s,res);
  return s;
}
884 
885 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
886 
888  const __m128i neg = _mm_set_epi64x( 0, 0x8000000000000000 );
889  // lh .z * rh .x , lh .z * rh .y
890  __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
891  // rh .z * lh .x , rh .z * lh .y
892  __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
893  __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
894  m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
895  m1 = __m128d(_mm_xor_si128 ( __m128i(m1) , neg )); // change the sign of the first element
896  // lh .x * rh .y , lh .y * rh .x
897  l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
898  // lh .x * rh .y - lh .y * rh .x
899  __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
900  return mathSSE::Vec4D(m1, m2);
901 }
902 
903 
// dot product of lanes 0,1 only, delegating to the Vec2D dot above
inline double
__attribute__((always_inline)) __attribute__ ((pure))
dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(),b.xy());
}
909 
910 #endif // CMS_USE_AVX
911 
912 
// sqrt
// Vector specializations of mathSSE::sqrt (generic version at top of file).
namespace mathSSE {
  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
  // Vec2F: promote to Vec4F (extra lanes are zero) and reuse the 4-lane sqrt
  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
#ifdef CMS_USE_AVX
  template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec);}
#else
  // non-AVX double 4-vector: apply sqrt to each 128-bit half
  template<> inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
  }
#endif
}
926 
927 #endif // CMS_USE_SSE
928 
929 
930 #include <iosfwd>
931 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
932 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
933 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
934 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
935 
936 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
937 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
938 
939 
940 #endif // DataFormat_Math_SSEVec_H
Definition: AVXVec.h:6
T & operator[](unsigned int n)
Definition: SSEVec.h:150
Vec4 get1() const
Definition: SSEVec.h:180
void set1(float f1)
Definition: SSEVec.h:176
Vec4< double > Vec4D
Definition: SSEVec.h:505
mathSSE::Vec4< double > andnot(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:128
Vec4< float > Vec3F
Definition: SSEVec.h:502
mathSSE::Vec4< double > operator|(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:122
Vec2< double > xy() const
Definition: AVXVec.h:84
Vec4< float > Vec4F
Definition: SSEVec.h:501
ExtVec< T, 4 > Vec4
Definition: ExtVec.h:62
MatrixMeschach operator+(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
mathSSE::Vec4< double > operator&(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:119
T arr[2]
Definition: SSEVec.h:159
MatrixMeschach operator-(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
mathSSE::Vec4< double > cmpgt(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:104
OldVec< T > o
Definition: SSEVec.h:195
Basic3DVector< long double > operator/(const Basic3DVector< long double > &v, S s)
Definition: Electron.h:6
std::ostream & operator<<(std::ostream &out, mathSSE::Vec2D const &v)
Definition: SSEVec.cc:16
T operator[](unsigned int n) const
Definition: SSEVec.h:154
Vec4(float f1, float f2, float f3, float f4=0)
Definition: SSEVec.h:167
As3D< T > as3D(Vec4< T > const &v)
Definition: SSEVec.h:514
Vec2< T > zw() const
Definition: SSEVec.h:190
T sqrt(T t)
Definition: SSEVec.h:18
bool operator==(const QGLikelihoodParameters &lhs, const QGLikelihoodCategory &rhs)
Test if parameters are compatible with category.
T operator[](int i) const
Vec4< double > Vec3D
Definition: SSEVec.h:504
T min(T a, T b)
Definition: MathUtil.h:58
struct mathSSE::Rot3 __attribute__
Vec4(float f1)
Definition: SSEVec.h:170
#define N
Definition: blowfish.cc:9
Vec2< float > Vec2F
Definition: SSEVec.h:500
ExtVec< T, 2 > Vec2
Definition: ExtVec.h:63
double b
Definition: hdecay.h:120
As3D(Vec4< T > const &iv)
Definition: SSEVec.h:510
Vec2(T f1, T f2)
Definition: SSEVec.h:120
Vec2 get1() const
Definition: SSEVec.h:132
std::vector< std::vector< double > > tmp
Definition: MVATrainer.cc:100
Vec2(Vec2< U > v)
Definition: SSEVec.h:143
T dot(const Basic3DVector &v) const
Scalar product, or "dot" product, with a vector of same type.
double a
Definition: hdecay.h:121
mathSSE::Vec4< double > operator^(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:125
auto zw(V v) -> Vec2< typename std::remove_reference< decltype(v[0])>::type >
Definition: ExtVec.h:75
MatrixMeschach operator*(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
Vec4< T > const & v
Definition: SSEVec.h:509
long double T
mathSSE::Vec4< double > hadd(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:108
mathSSE::Vec4< double > cmpeq(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:100
Vec2< double > Vec2D
Definition: SSEVec.h:503
Vec2< T > xy() const
Definition: SSEVec.h:189
Basic3DVector cross(const Basic3DVector &v) const
Vector product, or "cross" product, with a vector of same type.
Vec2(T f1)
Definition: SSEVec.h:123