CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
SSEVec.h
Go to the documentation of this file.
1 #ifndef DataFormat_Math_SSEVec_H
2 #define DataFormat_Math_SSEVec_H
3 
4 #if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__)
5 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
6 #include <x86intrin.h>
7 #define CMS_USE_SSE
8 #ifdef __AVX__
9 #define CMS_USE_AVX
10 #endif /* __AVX__ */
11 #else /* defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4) */
12 
13 #ifdef __SSE2__
14 #define CMS_USE_SSE
15 
16 #include <mmintrin.h>
17 #include <emmintrin.h>
18 #endif /* __SSE2__ */
19 #ifdef __SSE3__
20 #include <pmmintrin.h>
21 #endif /* __SSE3__ */
22 #ifdef __SSE4_1__
23 #include <smmintrin.h>
24 #endif /* __SSE4_1__ */
25 
26 #endif /* defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4) */
27 #endif /* !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) */
28 
29 #include<cmath>
30 
31 // needed fro gcc < 4.6
32 namespace mathSSE {
33  struct ZeroUpper {
35 #ifdef __AVX__
36  _mm256_zeroupper();
37 #endif
38  }
40 #ifdef __AVX__
41  _mm256_zeroupper();
42 #endif
43  }
44  };
45 }
46 
47 namespace mathSSE {
48  template<typename T> inline T sqrt(T t) { return std::sqrt(t);}
49 }
50 
51 
52 namespace mathSSE {
53 #ifdef CMS_USE_SSE
54  //dot
55  inline __m128
56  __attribute__((always_inline)) __attribute__ ((pure))
57  _mm_dot_ps(__m128 v1, __m128 v2) {
58 #ifdef __SSE4_1__
59  return _mm_dp_ps(v1, v2, 0xff);
60 #else
61  __m128 mul = _mm_mul_ps(v1, v2);
62 #ifdef __SSE3__
63  mul = _mm_hadd_ps(mul,mul);
64  return _mm_hadd_ps(mul,mul);
65 #else
66  __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
67  mul = _mm_add_ps(mul, swp);
68  swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
69  return _mm_add_ps(mul, swp);
70 #endif
71 #endif
72  }
73 
74 
75  // cross (just 3x3)
76  inline __m128
77  __attribute__((always_inline)) __attribute__ ((pure))
78  _mm_cross_ps(__m128 v1, __m128 v2) {
79  // same order is _MM_SHUFFLE(3,2,1,0)
80  // x2, z1,z1
81  __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
82  // y1, x2,y2
83  __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
84 
85  __m128 v5 = _mm_mul_ps(v3, v4);
86 
87  // x1, z2,z2
88  v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
89  // y2, x1,y1
90  v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
91 
92  v3 = _mm_mul_ps(v3, v4);
93  const __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
94  return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
95  }
96 #endif // CMS_USE_SSE
97 
98 #ifdef CMS_USE_AVX
99  inline __m256d
100  __attribute__((always_inline)) __attribute__ ((pure))
101  _mm256_dot_pd(__m256d v1, __m256d v2) {
102  __m256d mul = _mm256_mul_pd(v1, v2);
103  mul = _mm256_hadd_pd(mul,mul);
104  __m256d tmp = _mm256_permute2f128_pd(mul,mul,1);
105  return _mm256_add_pd(mul,tmp);
106  }
107 
108  inline __m256d
109  __attribute__((always_inline)) __attribute__ ((pure))
110  _mm256_cross_pd(__m256d v1, __m256d v2) {
111 
112  __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2<<4)+1);
113  v3 = _mm256_permute_pd(v3,0);
114 
115  __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2<<4));
116  v4 = _mm256_permute_pd(v4,5);
117 
118  __m256d v5 = _mm256_mul_pd(v3, v4);
119 
120  v3 = _mm256_permute2f128_pd(v1, v2, (2<<4)+1);
121  v3 = _mm256_permute_pd(v3,0);
122 
123  v4 = _mm256_permute2f128_pd(v2, v1, (2<<4));
124  v4 = _mm256_permute_pd(v4,5);
125 
126  v3 = _mm256_mul_pd(v3, v4);
127  const __m256d neg = _mm256_set_pd(0.0,0.0,-0.0,0.0);
128  return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);
129 
130  }
131 
132 #endif // CMS_USE_AVX
133 
134 
135 
136  template<typename T>
137  struct OldVec { T theX; T theY; T theZ; T theW;} __attribute__ ((aligned (16)));
138 #ifdef CMS_USE_AVX
139  template<>
140  struct OldVec<double> { double theX; double theY; double theZ; double theW;} __attribute__ ((aligned (32)));
141 #endif
142 
143  template<typename T> union Vec4;
144 
145  template<typename T> union Vec2{
146  Vec2() {
147  arr[0] = 0; arr[1] = 0;
148  }
149  Vec2(T f1, T f2) {
150  arr[0] = f1; arr[1] = f2;
151  }
152  explicit Vec2(T f1) {
153  arr[0] = f1; arr[1] = f1;
154  }
155 
156  void set(T f1, T f2) {
157  arr[0] = f1; arr[1] = f2;
158  }
159 
160  template <int N>
161  Vec2 get1() const {
162  return Vec2(arr[N],arr[N]);
163  }
164 
165  /*
166  Vec2 get1(unsigned int n) const {
167  return Vec2(arr[n],arr[n]);
168  }
169  */
170 
171  template<typename U>
173  arr[0] = v[0]; arr[1] = v[1];
174 
175  }
176 
177  inline Vec2(Vec4<T> v4);
178 
179  T & operator[](unsigned int n) {
180  return arr[n];
181  }
182 
183  T operator[](unsigned int n) const {
184  return arr[n];
185  }
186 
187 
188  T arr[2];
189  };
190 
191 
192  template<typename T> union Vec4{
193  Vec4() {
194  arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
195  }
196  Vec4(float f1, float f2, float f3, float f4=0) {
197  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
198  }
199  explicit Vec4(float f1) {
200  set1(f1);
201  }
202  void set(float f1, float f2, float f3, float f4=0) {
203  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
204  }
205  void set1(float f1) {
206  arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
207  }
208  template <int N>
209  Vec4 get1() const {
210  return Vec4(arr[N],arr[N],arr[N],arr[N]);
211  }
212  /*
213  Vec4 get1(unsigned int n) const {
214  return Vec4(arr[n],arr[n],arr[n],arr[n]);
215  }
216  */
217 
218  Vec2<T> xy() const { return Vec2<T>(arr[0],arr[1]);}
219  Vec2<T> zw() const { return Vec2<T>(arr[2],arr[3]);}
220 
221 
222 
223  T __attribute__ ((aligned(16))) arr[4];
225  };
226 
227  template<typename T>
228  inline Vec2<T>::Vec2(Vec4<T> v4) {
229  arr[0]=v4[0];arr[1]=v4[1];
230  }
231 
232 
233 #ifdef CMS_USE_SSE
234 
235  template<>
236  union Vec2<double>;
237  template<>
238  union Vec4<double>;
239 
240  template<typename T> union Mask2{};
241  template<typename T> union Mask4{};
242 
243  template<>
244  union Mask4<float> {
245  __m128 vec;
246  unsigned int __attribute__ ((aligned(16))) mask[4];
247  Mask4() {vec = _mm_setzero_ps();}
248  Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
249  mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
250  }
251  };
252 
253 #ifdef CMS_USE_AVX
254  template<>
255  union Mask4<double> {
256  __m256d vec;
257  unsigned long long __attribute__ ((aligned(32))) mask[4];
258  Mask4() {
259  vec = _mm256_setzero_pd();
260  }
261  Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
262  mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
263  }
264  };
265 #else
266  template<>
267  union Mask4<double> {
268  __m128d vec[2];
269  unsigned long long __attribute__ ((aligned(16))) mask[4];
270  Mask4() {
271  vec[0] = _mm_setzero_pd();
272  vec[1] = _mm_setzero_pd();
273  }
274  Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
275  mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
276  }
277  };
278 #endif
279 
280  template<>
281  union Mask2<double> {
282  __m128d vec;
283  unsigned long long __attribute__ ((aligned(16))) mask[2];
284  Mask2() {
285  vec = _mm_setzero_pd();
286  }
287  Mask2(unsigned long long m1, unsigned long long m2) {
288  mask[0]=m1; mask[1]=m2;
289  }
290  };
291 
292  template<>
293  union Vec4<float> {
294  typedef __m128 nativeType;
295  __m128 vec;
296  float __attribute__ ((aligned(16))) arr[4];
297  OldVec<float> o;
298 
299  Vec4(__m128 ivec) : vec(ivec) {}
300 
301  Vec4(OldVec<float> const & ivec) : o(ivec) {}
302 
303  Vec4() {
304  vec = _mm_setzero_ps();
305  }
306 
307 
308  inline Vec4(Vec4<double> ivec);
309 
310  explicit Vec4(float f1) {
311  set1(f1);
312  }
313 
314  Vec4(float f1, float f2, float f3, float f4=0) {
315  vec = _mm_set_ps(f4, f3, f2, f1);
316  }
317 
318 
319  Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
320  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
321  arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
322  }
323 
324  Vec4( Vec2<float> ivec0, float f3, float f4=0) {
325  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
326  arr[2] = f3; arr[3] = f4;
327  }
328 
329  Vec4( Vec2<float> ivec0) {
330  vec = _mm_setzero_ps();
331  arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
332  }
333 
334 
335  // for masking
336  void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
337  Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec;
338  }
339 
340  void set(float f1, float f2, float f3, float f4=0) {
341  vec = _mm_set_ps(f4, f3, f2, f1);
342  }
343 
344  void set1(float f1) {
345  vec = _mm_set1_ps(f1);
346  }
347 
348  template <int N>
349  Vec4 get1() const {
350  return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
351  }
352 
353  float & operator[](unsigned int n) {
354  return arr[n];
355  }
356 
357  float operator[](unsigned int n) const {
358  return arr[n];
359  }
360 
361  Vec2<float> xy() const { return Vec2<float>(arr[0],arr[1]);}
362  Vec2<float> zw() const { return Vec2<float>(arr[2],arr[3]);}
363 
364  };
365 
366 
367  template<>
368  union Vec2<double> {
369  typedef __m128d nativeType;
370  __m128d vec;
371  double __attribute__ ((aligned(16))) arr[2];
372 
373  Vec2(__m128d ivec) : vec(ivec) {}
374 
375  Vec2() {
376  vec = _mm_setzero_pd();
377  }
378 
379 
380  inline Vec2(Vec2<float> ivec);
381 
382 
383  Vec2(double f1, double f2) {
384  vec = _mm_set_pd(f2,f1);
385  }
386 
387  explicit Vec2(double f1) {
388  set1(f1);
389  }
390 
391  inline Vec2(Vec4<double> v4);
392 
393  // for masking
394  void setMask(unsigned long long m1, unsigned long long m2) {
395  Mask2<double> mask(m1,m2); vec=mask.vec;
396  }
397 
398 
399  void set(double f1, double f2) {
400  vec = _mm_set_pd(f2,f1);
401  }
402 
403  void set1(double f1) {
404  vec = _mm_set1_pd(f1);
405  }
406 
407  template <int N>
408  Vec2 get1() const {
409  return Vec2(arr[N],arr[N]);
410  }
411  /*
412  Vec2 get1(unsigned int n) const {
413  return Vec2(arr[n],arr[n]);
414  }
415  */
416  double & operator[](unsigned int n) {
417  return arr[n];
418  }
419  double operator[](unsigned int n) const {
420  return arr[n];
421  }
422  };
423 
424 
425 
426 
427 #ifdef CMS_USE_AVX
428 }// namespace mathSSE
429 #include "AVXVec.h"
430 
431 namespace mathSSE {
432 #else
433  template<>
434  union Vec4<double> {
435  __m128d vec[2];
436  double __attribute__ ((aligned(16))) arr[4];
437  OldVec<double> o;
438 
439  Vec4(__m128d ivec[]) {
440  vec[0] = ivec[0];
441  vec[1] = ivec[1];
442  }
443 
444  Vec4(__m128d ivec0, __m128d ivec1) {
445  vec[0] = ivec0;
446  vec[1] = ivec1;
447  }
448 
449  Vec4() {
450  vec[0] = _mm_setzero_pd();
451  vec[1] = _mm_setzero_pd();
452  }
453 
454  Vec4( Vec4<float> ivec) {
455  vec[0] = _mm_cvtps_pd(ivec.vec);
456  vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
457  }
458 
459  explicit Vec4(double f1) {
460  set1(f1);
461  }
462 
463  Vec4(double f1, double f2, double f3, double f4=0) {
464  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
465  }
466 
467  Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
468  vec[0] = ivec0.vec;
469  vec[1] = ivec1.vec;
470  }
471 
472  Vec4( Vec2<double> ivec0, double f3, double f4=0) {
473  vec[0] = ivec0.vec;
474  arr[2] = f3; arr[3] = f4;
475  }
476 
477  Vec4( Vec2<double> ivec0) {
478  vec[0] = ivec0.vec;
479  vec[1] = _mm_setzero_pd();
480  }
481 
482 
483  Vec4(OldVec<double> const & ivec) : o(ivec) {}
484 
485  // for masking
486  void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
487  Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1];
488  }
489 
490 
491  void set(double f1, double f2, double f3, double f4=0) {
492  arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
493  }
494 
495  void set1(double f1) {
496  vec[0] = vec[1]= _mm_set1_pd(f1);
497  }
498 
499 
500  template<int N>
501  Vec4 get1() const {
502  return Vec4(arr[N],arr[N],arr[N],arr[N]);
503  }
504 
505  double & operator[](unsigned int n) {
506  return arr[n];
507  }
508 
509  double operator[](unsigned int n) const {
510  return arr[n];
511  }
512 
513  Vec2<double> xy() const { return vec[0];}
514  Vec2<double> zw() const { return vec[1];}
515 
516  };
517 
518 
519  inline Vec4<float>::Vec4(Vec4<double> ivec) {
520  vec = _mm_cvtpd_ps(ivec.vec[0]);
521  __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
522  vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
523  }
524 #endif // CMS_USE_AVX
525 
526 
527 #endif // CMS_USE_SSE
528 
535 
536  template<typename T>
537  struct As3D {
538  Vec4<T> const & v;
539  As3D(Vec4<T> const &iv ) : v(iv){}
540  };
541 
542  template<typename T>
543  inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
544 
545 } // namespace mathSSE
546 
547 #ifdef CMS_USE_SSE
548 
549 
550 //float op
551 
552 
554  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
555 }
556 
558  return _mm_cmpeq_ps(a.vec,b.vec);
559 }
560 
562  return _mm_cmpgt_ps(a.vec,b.vec);
563 }
564 
565 #ifdef __SSE3__
567  return _mm_hadd_ps(a.vec,b.vec);
568 }
569 #endif
570 
571 
573  const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
574  return _mm_xor_ps(a.vec,neg);
575 }
576 
578  return _mm_and_ps(a.vec,b.vec);
579 }
581  return _mm_or_ps(a.vec,b.vec);
582 }
584  return _mm_xor_ps(a.vec,b.vec);
585 }
587  return _mm_andnot_ps(a.vec,b.vec);
588 }
589 
590 
592  return _mm_add_ps(a.vec,b.vec);
593 }
594 
596  return _mm_sub_ps(a.vec,b.vec);
597 }
598 
600  return _mm_mul_ps(a.vec,b.vec);
601 }
602 
604  return _mm_div_ps(a.vec,b.vec);
605 }
606 
608  return _mm_min_ps(a.vec,b.vec);
609 }
610 
612  return _mm_max_ps(a.vec,b.vec);
613 }
614 
615 
616 inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
617  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
618 }
619 
620 inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
621  return _mm_mul_ps(_mm_set1_ps(a),b.vec);
622 }
623 
624 inline mathSSE::Vec4F operator/(mathSSE::Vec4F b,float a) {
625  return _mm_div_ps(b.vec,_mm_set1_ps(a));
626 }
627 
628 
629 inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
630  using mathSSE::_mm_dot_ps;
631  float s;
632  _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
633  return s;
634 }
635 
637  using mathSSE::_mm_cross_ps;
638  return _mm_cross_ps(a.vec,b.vec);
639 }
640 
641 
642 inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
643  mathSSE::Vec4F mul = a*b;
644 #ifdef __SSE3__
645  mul = hadd(mul,mul);
646 #else
647  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
648  mul.vec = _mm_add_ps(mul.vec, swp);
649 #endif
650  float s;
651  _mm_store_ss(&s,mul.vec);
652  return s;
653 }
654 
655 
656 //
657 // float op 2d (use the 4d one...)
658 //
660  return -mathSSE::Vec4F(a);
661 }
662 
663 
665  return mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
666 }
667 
669  return mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
670 }
671 
673  return mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
674 }
675 
677  return mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
678 }
679 
681  return min(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
682 }
683 
686 }
687 
688 
689 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
690  return s*mathSSE::Vec4F(a);
691 }
692 
693 inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) {
694  return s*mathSSE::Vec4F(a);
695 }
696 
697 inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) {
698  return mathSSE::Vec4F(a)/s;
699 }
700 
701 
702 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
703 
704 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
705  return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
706 }
707 
708 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__ ((pure));
709 
710 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
711  return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
712 }
713 
714 
716 // double op 2d
717 //
718 
719 inline mathSSE::Vec2D::Vec2(Vec4D v4) {
720 #ifdef CMS_USE_AVX
721  vec = _mm256_castpd256_pd128(v4.vec);
722 #else
723  vec = v4.vec[0];
724 #endif
725 }
726 
727 inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
728  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
729 }
730 
732  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
733  return _mm_xor_pd(a.vec,neg);
734 }
735 
736 
738  return _mm_and_pd(a.vec,b.vec);
739 }
741  return _mm_or_pd(a.vec,b.vec);
742 }
744  return _mm_xor_pd(a.vec,b.vec);
745 }
747  return _mm_andnot_pd(a.vec,b.vec);
748 }
749 
750 #ifdef __SSE3__
752  return _mm_hadd_pd(a.vec,b.vec);
753 }
754 #endif
755 
757  return _mm_add_pd(a.vec,b.vec);
758 }
759 
761  return _mm_sub_pd(a.vec,b.vec);
762 }
763 
765  return _mm_mul_pd(a.vec,b.vec);
766 }
767 
769  return _mm_div_pd(a.vec,b.vec);
770 }
771 
772 
774  return _mm_min_pd(a.vec,b.vec);
775 }
776 
778  return _mm_max_pd(a.vec,b.vec);
779 }
780 
781 
782 inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
783  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
784 }
785 
786 inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
787  return _mm_mul_pd(_mm_set1_pd(a),b.vec);
788 }
789 
790 inline mathSSE::Vec2D operator/(mathSSE::Vec2D b,double a) {
791  return _mm_div_pd(b.vec,_mm_set1_pd(a));
792 }
793 
794 
795 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));
796 
797 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
798  __m128d res = _mm_mul_pd ( a.vec, b.vec);
799 #ifdef __SSE3__
800  res = _mm_hadd_pd(res,res);
801 #else
802  res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
803 #endif
804  double s;
805  _mm_store_sd(&s,res);
806  return s;
807 }
808 
809 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__ ((pure));
810 
811 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
812  __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
813  res = _mm_mul_pd ( a.vec , res );
814  res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
815  double s;
816  _mm_store_sd(&s,res);
817  return s;
818 }
819 
820 
821 #ifndef CMS_USE_AVX
822 // double op 3d
823 
824 
825 #ifdef __SSE3__
826 // consistent with AVX...
829 }
830 #endif
831 
832 
833 inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
834  return
835  _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
836  _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
837 }
838 
840  const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
841  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
842 }
843 
844 
846  return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
847 }
848 
849 
851  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
852 }
854  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
855 }
857  return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
858 }
859 
860 
862  using namespace mathSSE;
864 }
865 
867  using namespace mathSSE;
869 }
870 
871 inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
872  __m128d res = _mm_set1_pd(a);
873  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
874 }
875 
876 inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
877  __m128d res = _mm_set1_pd(a);
878  return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
879 }
880 
881 inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
882  a = 1./a;
883  return a*b;
884 }
885 
886 // mix algebra (creates ambiguities...)
887 /*
888 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
889  return a + mathSSE::Vec4D(b);
890 }
891 inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
892  return a + mathSSE::Vec4D(b);
893 }
894 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
895  return a - mathSSE::Vec4D(b);
896 }
897 inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
898  return a - mathSSE::Vec4D(b);
899 }
900 */
901 
902 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
903 
904 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
905  __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
906  _mm_mul_sd ( a.vec[1], b.vec[1])
907  );
908  res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
909  double s;
910  _mm_store_sd(&s,res);
911  return s;
912 }
913 
914 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
915 
916 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
917  const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
918  // lh .z * rh .x , lh .z * rh .y
919  __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
920  // rh .z * lh .x , rh .z * lh .y
921  __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
922  __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
923  m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
924  m1 = _mm_xor_pd ( m1 , neg ); // change the sign of the first element
925  // lh .x * rh .y , lh .y * rh .x
926  l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
927  // lh .x * rh .y - lh .y * rh .x
928  __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
929  return mathSSE::Vec4D(m1, m2);
930 }
931 
932 
933 inline double
934 __attribute__((always_inline)) __attribute__ ((pure))
935 dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
936  return dot(a.xy(),b.xy());
937 }
938 
939 #endif // CMS_USE_AVX
940 
941 
942 // sqrt
943 namespace mathSSE {
944  template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
945  template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
946  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
947 #ifdef CMS_USE_AVX
948  template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec);}
949 #else
950  template<> inline Vec4D sqrt(Vec4D v) {
951  return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
952  }
953 #endif
954 }
955 
956 // chephes func
958 namespace mathSSE {
959  inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
960  inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
961  inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
962  inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
963  inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
964 
965  inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
966  inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
967  inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
968  inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
969  inline void sincos(float f, float & s, float & c) {
970  __m128 vs, vc;
971  sincos_ps(_mm_load_ss(&f),&vs, &vc);
972  _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);
973  }
974 }
975 #endif // CMS_USE_SSE
976 
977 
978 #include <iosfwd>
979 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
980 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
981 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
982 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
983 
984 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
985 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
986 
987 
988 #endif // DataFormat_Math_SSEVec_H
T & operator[](unsigned int n)
Definition: SSEVec.h:179
Vec4 get1() const
Definition: SSEVec.h:209
void set1(float f1)
Definition: SSEVec.h:205
Vec4< double > Vec4D
Definition: SSEVec.h:534
mathSSE::Vec4< double > andnot(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:128
Vec4< float > Vec3F
Definition: SSEVec.h:531
mathSSE::Vec4< double > operator|(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:122
Vec4< float > Vec4F
Definition: SSEVec.h:530
ExtVec< T, 4 > Vec4
Definition: ExtVec.h:23
MatrixMeschach operator+(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
mathSSE::Vec4< double > operator&(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:119
auto zw(V v) -> Vec2< typenamestd::remove_reference< decltype(v[0])>::type >
Definition: ExtVec.h:36
Sin< T >::type sin(const T &t)
Definition: Sin.h:22
T arr[2]
Definition: SSEVec.h:188
MatrixMeschach operator-(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
void set(T f1, T f2)
Definition: SSEVec.h:156
mathSSE::Vec4< double > cmpgt(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:104
OldVec< T > o
Definition: SSEVec.h:224
std::ostream & operator<<(std::ostream &out, const ALILine &li)
Definition: ALILine.cc:187
Basic3DVector< long double > operator/(const Basic3DVector< long double > &v, S s)
Vec4< double > Vec4D
Definition: ExtVec.h:126
T operator[](unsigned int n) const
Definition: SSEVec.h:183
Vec4(float f1, float f2, float f3, float f4=0)
Definition: SSEVec.h:196
Vec2< float > Vec2F
Definition: ExtVec.h:121
As3D< T > as3D(Vec4< T > const &v)
Definition: SSEVec.h:543
Vec2< T > zw() const
Definition: SSEVec.h:219
const T & max(const T &a, const T &b)
T sqrt(T t)
Definition: SSEVec.h:48
bool operator==(const QGLikelihoodParameters &lhs, const QGLikelihoodCategory &rhs)
Test if parameters are compatible with category.
Cos< T >::type cos(const T &t)
Definition: Cos.h:22
T operator[](int i) const
Vec4< double > Vec3D
Definition: SSEVec.h:533
struct mathSSE::Rot3 __attribute__
tuple out
Definition: dbtoconf.py:99
Vec4(float f1)
Definition: SSEVec.h:199
#define N
Definition: blowfish.cc:9
Vec2< float > Vec2F
Definition: SSEVec.h:529
ExtVec< T, 2 > Vec2
Definition: ExtVec.h:24
double b
Definition: hdecay.h:120
As3D(Vec4< T > const &iv)
Definition: SSEVec.h:539
Vec2(T f1, T f2)
Definition: SSEVec.h:149
a f
Definition: SIMDVec.h:35
Vec2 get1() const
Definition: SSEVec.h:161
std::vector< std::vector< double > > tmp
Definition: MVATrainer.cc:100
Vec2(Vec2< U > v)
Definition: SSEVec.h:172
T dot(const Basic3DVector &v) const
Scalar product, or &quot;dot&quot; product, with a vector of same type.
double a
Definition: hdecay.h:121
Vec2< double > Vec2D
Definition: ExtVec.h:124
mathSSE::Vec4< double > operator^(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:125
MatrixMeschach operator*(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
Vec4< T > const & v
Definition: SSEVec.h:538
long double T
mathSSE::Vec4< double > hadd(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:108
mathSSE::Vec4< double > cmpeq(mathSSE::Vec4< double > a, mathSSE::Vec4< double > b)
Definition: AVXVec.h:100
def template
Definition: svgfig.py:520
T __attribute__((aligned(16))) arr[4]
Vec2< double > Vec2D
Definition: SSEVec.h:532
void set(float f1, float f2, float f3, float f4=0)
Definition: SSEVec.h:202
Vec2< T > xy() const
Definition: SSEVec.h:218
Basic3DVector cross(const Basic3DVector &v) const
Vector product, or &quot;cross&quot; product, with a vector of same type.
Vec2(T f1)
Definition: SSEVec.h:152