sse_mathfun.h
#ifndef SSE_MATHFUN_H
#define SSE_MATHFUN_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__)
#if (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)) || defined(__clang__)
#include <x86intrin.h>
#define CMS_USE_SSE

#else

#ifdef __SSE2__
#define CMS_USE_SSE

#include <mmintrin.h>
#include <emmintrin.h>
#endif /* __SSE2__ */
#ifdef __SSE3__
#include <pmmintrin.h>
#endif /* __SSE3__ */
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif /* __SSE4_1__ */

#endif /* (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)) || defined(__clang__) */
#endif /* !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) */

#ifdef CMS_USE_SSE

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#define USE_SSE2
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

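/* For reference (illustrative note, not part of the original header):
   _PS_CONST(1, 1.0f) expands to a 16-byte-aligned array of four identical
   floats, and the functions below load such a constant as a packed vector by
   reinterpreting the pointer:

     _PS_CONST(1, 1.0f);        // static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
     v4sf one = *(v4sf*)_ps_1;  // broadcast the constant into a 4-wide float vector
*/
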
_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, unsigned int, 0x00800000U);
_PS_CONST_TYPE(mant_mask, unsigned int, 0x7f800000U);
_PS_CONST_TYPE(inv_mant_mask, unsigned int, ~0x7f800000U);

_PS_CONST_TYPE(sign_mask, unsigned int, 0x80000000U);
_PS_CONST_TYPE(inv_sign_mask, unsigned int, ~0x80000000U);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
   The _mm_movehl_ps bug is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were
   completely broken on my mingw gcc 3.4.5...

   Note that the _mm_cmp* bug occurs only at the -O0 optimization level.
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpnleps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
  xmm_mm_union u; u.xmm = xmm_; \
  mm0_ = u.mm[0]; \
  mm1_ = u.mm[1]; \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
  xmm_mm_union u; u.mm[0] = mm0_; u.mm[1] = mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x < 0 and propagates NaN inputs
*/
inline v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
  // propagate NaN
  invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps()));

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the true base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arguments become NaN (and NaN inputs stay NaN)
  return x;
}
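
/* Usage sketch (illustrative only, not part of the original header): load four
   floats into a v4sf, call log_ps, and store the packed result back.

     float in[4] = {1.0f, 2.718282f, 10.0f, 0.5f};
     float out[4];
     v4sf v = _mm_loadu_ps(in);
     _mm_storeu_ps(out, log_ps(v));
     // out[i] now approximates logf(in[i]) for each lane
*/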

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v4sf exp_ps(v4sf x) {
  v4sf tmp, fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  // propagate NaN
  v4sf invalid_mask = _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps());

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_setzero_ps();
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  y = _mm_or_ps(y, invalid_mask); // propagate NaN
  return y;
}
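
/* Usage sketch (illustrative only): exponentiate four values at once.

     float in[4] = {0.0f, 1.0f, -2.0f, 5.0f};
     float out[4];
     _mm_storeu_ps(out, exp_ps(_mm_loadu_ps(in)));
     // out[i] now approximates expf(in[i]); inputs are clamped to
     // [-88.376, +88.376] before the polynomial is evaluated
*/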

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for larger values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching to an
   SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
inline v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2, xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_setzero_ps();
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
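
/* Usage sketch (illustrative only): four sines per call.

     #include <cmath>
     float in[4] = {0.1f, 1.0f, 2.5f, -3.0f};
     float out[4];
     _mm_storeu_ps(out, sin_ps(_mm_loadu_ps(in)));
     // each out[i] matches std::sin(in[i]) to within a few ulps for |in[i]| < 8192
*/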

/* almost the same as sin_ps */
inline v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2, xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_setzero_ps();
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
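
/* Quick sanity check (illustrative only): sin_ps and cos_ps should satisfy
   sin^2(x) + cos^2(x) ~= 1 in every lane.

     v4sf x = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.5f);
     v4sf s = sin_ps(x), c = cos_ps(x);
     v4sf one_check = _mm_add_ps(_mm_mul_ps(s, s), _mm_mul_ps(c, c));
     // each lane of one_check is ~1.0f
*/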

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them...
   it is almost as fast, and gives you a free cosine with your sine */
inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3, sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_setzero_ps();
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x, x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
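
/* Usage sketch (illustrative only): one call yields both the sine and the cosine.

     float angles[4] = {0.0f, 0.5f, 1.0f, 1.5f};
     float s[4], c[4];
     v4sf vs, vc;
     sincos_ps(_mm_loadu_ps(angles), &vs, &vc);
     _mm_storeu_ps(s, vs);
     _mm_storeu_ps(c, vc);
     // s[i] ~= sinf(angles[i]) and c[i] ~= cosf(angles[i])
*/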

#endif // CMS_USE_SSE

#endif // SSE_MATHFUN_H