SSEVec.h
#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif /* __AVX2__ */
#endif /* defined(__GNUC__) */
#endif /* !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) */

#include <cmath>

namespace mathSSE {
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
} // namespace mathSSE
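
// Note: the scalar mathSSE::sqrt above is the generic fallback; it is
// specialized for the SIMD vector types (Vec4F, Vec2F, Vec2D, Vec4D) near the
// end of this file.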

namespace mathSSE {
#ifdef CMS_USE_SSE
  //dot
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
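
  // Illustrative usage (values made up for the sketch): on all three code
  // paths, every lane of the result holds the full 4-component dot product.
  //   __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // (1, 2, 3, 4)
  //   __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f);  // (5, 6, 7, 8)
  //   float d;
  //   _mm_store_ss(&d, _mm_dot_ps(a, b));         // d == 70.f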

  // cross (just 3x3)
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_cross_ps(__m128 v1, __m128 v2) {
    // same order is _MM_SHUFFLE(3,2,1,0)
    // x2, z1,z1
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // y1, x2,y2
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // x1, z2,z2
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // y2, x1,y1
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
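
  // For v1 = (x1, y1, z1, .) and v2 = (x2, y2, z2, .) the result is
  // (y1*z2 - z1*y2, z1*x2 - x1*z2, x1*y2 - y1*x2, 0): the difference of the
  // two shuffled products leaves the y component negated, and the 0x80000000
  // XOR mask flips its sign bit back.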
#endif // CMS_USE_SSE

#ifdef CMS_USE_AVX2
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }

  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }
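
  // These mirror the SSE helpers above: _mm256_dot_pd leaves the 4-component
  // dot product in every lane (intra-lane hadd plus a cross-lane permute and
  // add), and _mm256_cross_pd computes the 3D cross product, again correcting
  // the sign of the y component with an XOR mask.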

#endif // CMS_USE_AVX2

  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));
#ifdef CMS_USE_AVX2
  template <>
  struct OldVec<double> {
    double theX;
    double theY;
    double theZ;
    double theW;
  } __attribute__((aligned(32)));
#endif

  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1;
      arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */

    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }

    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };
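
  // Illustrative usage (values made up):
  //   mathSSE::Vec2<float> p(1.f, 2.f);
  //   float y = p[1];                          // 2.f
  //   mathSSE::Vec2<float> xx = p.get1<0>();   // (1, 1)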

  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    explicit Vec4(T f1) { set1(f1); }
    void set(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set1(T f1) {
      arr[0] = f1;
      arr[1] = f1;
      arr[2] = f1;
      arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }
    /*
    Vec4 get1(unsigned int n) const {
      return Vec4(arr[n],arr[n],arr[n],arr[n]);
    }
    */

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }

#ifdef CMS_USE_SSE

  template <>
  union Vec2<double>;
  template <>
  union Vec4<double>;

  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
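
  // Illustrative usage (mask value made up): the union writes raw bit patterns
  // into the same storage as the SIMD register.
  //   mathSSE::Mask4<float> m(0x80000000, 0, 0, 0);
  //   // _mm_xor_ps(v, m.vec) would flip the sign of lane 0 of v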

#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__((aligned(32))) mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif

  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__((aligned(16))) mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };

  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) { set1(f1); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }

    float operator[](unsigned int n) const { return arr[n]; }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };
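
  // Illustrative usage (values made up):
  //   mathSSE::Vec4<float> v(1.f, 2.f, 3.f);   // w defaults to 0
  //   mathSSE::Vec4<float> z = v.get1<2>();    // (3, 3, 3, 3), one shuffle
  //   mathSSE::Vec2<float> lo = v.xy();        // (1, 2)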

  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }
    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */
    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };

#ifdef CMS_USE_AVX2
} // namespace mathSSE
#include "private/AVXVec.h"

namespace mathSSE {
#else
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
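
  // The double-to-float conversion packs the two double-precision halves: each
  // _mm_cvtpd_ps fills the low two lanes of a register, and the final shuffle
  // interleaves the two partial results into (x, y, z, w) single precision.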
#endif // CMS_USE_AVX2

#endif // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };

  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }
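
  // As3D wraps a Vec4 by reference so the stream operators declared at the end
  // of this file can print just its first three components.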

} // namespace mathSSE

#ifdef CMS_USE_SSE

//float op

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}
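
// Illustrative usage (values made up):
//   mathSSE::Vec4F u(1.f, 2.f, 3.f);   // w = 0
//   mathSSE::Vec4F w(4.f, 5.f, 6.f);
//   float d = dot(u, w);               // 32.f = 1*4 + 2*5 + 3*6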

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}

//
// float op 2d (use the 4d one...)
//
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) { return -mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) + mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) - mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) * mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) / mathSSE::Vec4F(b); }

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) { return min(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) { return max(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) { return mathSSE::Vec4F(a) / s; }

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }

//
// double op 2d
//

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX2
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
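
// Illustrative usage (values made up): the 2D "cross" is the scalar z
// component of the corresponding 3D cross product.
//   mathSSE::Vec2D p(1., 0.), q(0., 1.);
//   double z = cross(p, q);   // 1.0 = p.x*q.y - p.y*q.x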

#ifndef CMS_USE_AVX2
// double op 3d

#ifdef __SSE3__
// consistent with AVX...
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        hadd(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}
#endif

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::min(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::max(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1. / a;
  return a * b;
}

// mix algebra (creates ambiguities...)
/*
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a - mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a - mathSSE::Vec4D(b);
}
*/

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // lh .z * rh .x , lh .z * rh .y
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // rh .z * lh .x , rh .z * lh .y
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);                // l1 - l2
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // switch the elements
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // change the sign of the first element
  // lh .x * rh .y , lh .y * rh .x
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  // lh .x * rh .y - lh .y * rh .x
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}
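
// After the swap and sign fix, m1 holds (cx, cy) and the low lane of m2 holds
// cz = ax*by - ay*bx; the w lane of the result is not zeroed and should be
// ignored by callers.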

inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif // CMS_USE_AVX2
// sqrt
namespace mathSSE {
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2F sqrt(Vec2F v) {
    return sqrt(Vec4F(v));
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
} // namespace mathSSE
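
// Illustrative usage (values made up): the specializations map directly onto
// the hardware square-root instructions.
//   mathSSE::Vec4F v(1.f, 4.f, 9.f, 16.f);
//   mathSSE::Vec4F r = mathSSE::sqrt(v);   // (1, 2, 3, 4)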

#endif // CMS_USE_SSE

#include <iosfwd>
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4D const &v);

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);
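// (These stream operators are implemented in SSEVec.cc.)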

#endif // DataFormat_Math_SSEVec_H