SSEVec.h
#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__) && !defined(__riscv)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __SSE4_1__
// slower than sse3
// #define CMS_USE_SSE4
#endif
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif /* __AVX2__ */
#endif /* defined(__GNUC__) */
#endif /* !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) */

#include <cmath>

namespace mathSSE {
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
}  // namespace mathSSE

namespace mathSSE {
#ifdef CMS_USE_SSE
  // dot product: the full 4-lane dot product of v1 and v2, broadcast to every lane
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef CMS_USE_SSE4
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
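
  // Illustrative sketch (not in the original file; the demo name is hypothetical).
  // With mul = (m0, m1, m2, m3) = v1 * v2, the SSE3 path does hadd(mul, mul) ->
  // (m0+m1, m2+m3, m0+m1, m2+m3), then hadd again -> the full sum in every lane.
  // The SSE2 path reaches the same broadcast sum with two shuffle+add steps.
#if 0  // usage sketch, kept out of the build
  inline float dot_ps_demo() {
    __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // lanes (1, 2, 3, 4)
    __m128 b = _mm_set_ps(1.f, 2.f, 3.f, 4.f);  // lanes (4, 3, 2, 1)
    float s;
    _mm_store_ss(&s, _mm_dot_ps(a, b));  // s == 1*4 + 2*3 + 3*2 + 4*1 == 20
    return s;
  }
#endif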

  // cross (just 3x3)
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_cross_ps(__m128 v1, __m128 v2) {
    // same order is _MM_SHUFFLE(3,2,1,0)
    // x2, z1, z1
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // y1, x2, y2
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // x1, z2, z2
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // y2, x1, y1
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
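
  // Illustrative sketch (not in the original file; the demo name is hypothetical).
  // The shuffles above assemble (a.y*b.z - a.z*b.y, a.x*b.z - a.z*b.x,
  // a.x*b.y - a.y*b.x, 0); the final XOR flips the sign of lane 1 only, which
  // completes the cyclic cross-product pattern.
#if 0  // usage sketch, kept out of the build
  inline void cross_ps_demo() {
    __m128 x = _mm_set_ps(0.f, 0.f, 0.f, 1.f);  // unit x
    __m128 y = _mm_set_ps(0.f, 0.f, 1.f, 0.f);  // unit y
    float r[4];
    _mm_storeu_ps(r, _mm_cross_ps(x, y));  // x cross y = z: r[2] == 1, others 0 (lane 1 may be -0.0)
  }
#endif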
#endif  // CMS_USE_SSE

#ifdef CMS_USE_AVX2
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }
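
  // Illustrative sketch (not in the original file; the demo name is hypothetical).
  // With mul = (m0, m1, m2, m3), _mm256_hadd_pd(mul, mul) yields
  // (m0+m1, m0+m1, m2+m3, m2+m3); swapping the 128-bit halves with
  // _mm256_permute2f128_pd and adding broadcasts m0+m1+m2+m3 to all lanes.
#if 0  // usage sketch, kept out of the build
  inline double dot_pd_demo() {
    __m256d a = _mm256_set_pd(4., 3., 2., 1.);  // lanes (1, 2, 3, 4)
    double s;
    _mm_store_sd(&s, _mm256_castpd256_pd128(_mm256_dot_pd(a, a)));  // 1+4+9+16 == 30
    return s;
  }
#endif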

  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }

#endif  // CMS_USE_AVX2

  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));
#ifdef CMS_USE_AVX2
  template <>
  struct OldVec<double> {
    double theX;
    double theY;
    double theZ;
    double theW;
  } __attribute__((aligned(32)));
#endif

  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1;
      arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n], arr[n]);
    }
    */

    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }

    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };

  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    explicit Vec4(float f1) { set1(f1); }
    void set(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set1(float f1) {
      arr[0] = f1;
      arr[1] = f1;
      arr[2] = f1;
      arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }
    /*
    Vec4 get1(unsigned int n) const {
      return Vec4(arr[n], arr[n], arr[n], arr[n]);
    }
    */

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }
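
  // Usage sketch (not in the original file; the demo name is hypothetical):
  // the generic unions behave like small fixed-size vectors with named slices.
#if 0  // kept out of the build
  inline void vec_demo() {
    Vec4<float> v(1.f, 2.f, 3.f, 4.f);
    Vec2<float> lo = v.xy();         // (1, 2)
    Vec2<float> hi = v.zw();         // (3, 4)
    Vec4<float> ones = v.get1<0>();  // broadcast lane 0: (1, 1, 1, 1)
  }
#endif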

#ifdef CMS_USE_SSE

  template <>
  union Vec2<double>;
  template <>
  union Vec4<double>;

  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };

#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__((aligned(32))) mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif

  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__((aligned(16))) mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };

  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) { set1(f1); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }

    float operator[](unsigned int n) const { return arr[n]; }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };
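
  // Usage sketch (not in the original file; the demo name is hypothetical):
  // setMask installs a raw bit pattern, which combines with bitwise intrinsics
  // (or the operators defined further down), e.g. XOR with 0x80000000 flips
  // the sign bit of the selected lanes only.
#if 0  // kept out of the build
  inline void mask_demo() {
    Vec4<float> sign;
    sign.setMask(0x80000000, 0, 0, 0);  // sign bit in lane 0 only
    Vec4<float> v(2.f, 3.f, 4.f, 5.f);
    Vec4<float> w = _mm_xor_ps(v.vec, sign.vec);  // (-2, 3, 4, 5)
  }
#endif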

  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }
    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n], arr[n]);
    }
    */
    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };

#ifdef CMS_USE_AVX2
}  // namespace mathSSE
#include "private/AVXVec.h"

namespace mathSSE {
#else
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
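
  // Usage sketch (not in the original file; the demo name is hypothetical):
  // double -> float narrowing converts each __m128d half and packs both back
  // into one __m128; widening goes the other way.
#if 0  // kept out of the build
  inline void convert_demo() {
    Vec4<double> d(1., 2., 3., 4.);
    Vec4<float> f(d);    // (1.f, 2.f, 3.f, 4.f)
    Vec4<double> d2(f);  // widen again, lanes preserved
  }
#endif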
#endif  // CMS_USE_AVX2

#endif  // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };

  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }

}  // namespace mathSSE

#ifdef CMS_USE_SSE

// float op

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}
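
// Usage sketch (not in the original file; the demo name is hypothetical):
// dot() reduces all four lanes, so a 3-vector stored in a Vec4F needs w == 0,
// which the constructors default to.
#if 0  // kept out of the build
inline float dot_demo() {
  mathSSE::Vec4F a(1.f, 2.f, 3.f);  // w defaults to 0
  mathSSE::Vec4F b(4.f, 5.f, 6.f);
  return dot(a, b);  // 1*4 + 2*5 + 3*6 == 32
}
#endif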

inline float dot3(mathSSE::Vec4F a, mathSSE::Vec4F b) { return dot(a, b); }

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
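
// Usage sketch (not in the original file; the demo name is hypothetical):
// cross() is the 3D cross product on the first three lanes; dotxy() reduces
// only lanes 0 and 1.
#if 0  // kept out of the build
inline void cross_dotxy_demo() {
  mathSSE::Vec4F x(1.f, 0.f, 0.f);
  mathSSE::Vec4F y(0.f, 1.f, 0.f);
  mathSSE::Vec4F z = cross(x, y);  // (0, 0, 1, 0)
  float d = dotxy(x, y);           // 1*0 + 0*1 == 0
}
#endif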

//
// float op 2d (use the 4d one...)
//
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) { return -mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) + mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) - mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) * mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) / mathSSE::Vec4F(b); }

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) { return min(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) { return max(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) { return mathSSE::Vec4F(a) / s; }

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }
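
// Usage sketch (not in the original file; the demo name is hypothetical):
// for 2D floats, dot and "cross" (the z-component of the 3D cross product,
// i.e. the perp-dot product) are plain scalar code.
#if 0  // kept out of the build
inline void vec2f_demo() {
  mathSSE::Vec2F a(3.f, 4.f);
  mathSSE::Vec2F b(4.f, 3.f);
  float d = dot(a, b);    // 3*4 + 4*3 == 24
  float c = cross(a, b);  // 3*3 - 4*4 == -7
}
#endif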

//
// double op 2d
//

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX2
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
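
// Usage sketch (not in the original file; the demo name is hypothetical):
// the shuffle swaps b to (b.y, b.x), the multiply gives (a.x*b.y, a.y*b.x),
// and the subtract leaves a.x*b.y - a.y*b.x in lane 0.
#if 0  // kept out of the build
inline double vec2d_demo() {
  mathSSE::Vec2D a(1., 0.);
  mathSSE::Vec2D b(0., 1.);
  return cross(a, b);  // +1: b is 90 degrees counter-clockwise from a
}
#endif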

#ifndef CMS_USE_AVX2
// double op 3d

#ifdef __SSE3__
// consistent with AVX...
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        hadd(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}
#endif

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::min(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::max(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1. / a;
  return a * b;
}

// mix algebra (creates ambiguities...)
/*
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a - mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a - mathSSE::Vec4D(b);
}
*/

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline double __attribute__((always_inline)) __attribute__((pure)) dot3(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a, b);
}

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}
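
// Usage sketch (not in the original file; the demo name is hypothetical):
// the SSE2 Vec4D keeps (x, y) and (z, w) in two __m128d registers; dot()
// folds both halves but excludes w, since _mm_mul_sd only multiplies the low
// lane of vec[1].
#if 0  // kept out of the build
inline double dot4d_demo() {
  mathSSE::Vec4D a(1., 2., 3., 99.);  // w is not part of the sum
  mathSSE::Vec4D b(4., 5., 6., 99.);
  return dot(a, b);  // 1*4 + 2*5 + 3*6 == 32
}
#endif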

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // lh.z * rh.x, lh.z * rh.y
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // rh.z * lh.x, rh.z * lh.y
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);                // l1 - l2
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // switch the elements
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // change the sign of the first element
  // lh.x * rh.y, lh.y * rh.x
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  // lh.x * rh.y - lh.y * rh.x
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}

inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif  // CMS_USE_AVX2

// sqrt
namespace mathSSE {
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2F sqrt(Vec2F v) {
    return sqrt(Vec4F(v));
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE
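
// Usage sketch (not in the original file; the demo name is hypothetical):
// the sqrt specializations apply elementwise; scalar types fall back to the
// generic template, i.e. std::sqrt.
#if 0  // kept out of the build
inline void sqrt_demo() {
  mathSSE::Vec4F v(1.f, 4.f, 9.f, 16.f);
  mathSSE::Vec4F r = mathSSE::sqrt(v);  // (1, 2, 3, 4)
  double s = mathSSE::sqrt(2.0);        // scalar overload -> std::sqrt
}
#endif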

#endif  // CMS_USE_SSE

#include <iosfwd>
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4D const &v);

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H