sse_mathfun.h
#ifndef SSE_MATHFUN_H
#define SSE_MATHFUN_H

#if (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)) || defined(__clang__)
#include <x86intrin.h>
#define CMS_USE_SSE

#else

#ifdef __SSE2__
#define CMS_USE_SSE

#include <mmintrin.h>
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#endif

#ifdef CMS_USE_SSE


/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do not
   expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/
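
/* Usage sketch (illustrative only, not part of the original library): the entry
   points defined below -- log_ps, exp_ps, sin_ps, cos_ps, sincos_ps -- all take
   and return a v4sf (__m128) holding four packed floats.  Assuming this header
   is included as "sse_mathfun.h", a minimal caller could look like:

     #include <cstdio>
     #include "sse_mathfun.h"

     int main() {
       v4sf x = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // _mm_set_ps loads lanes high-to-low
       v4sf l = log_ps(x);
       v4sf e = exp_ps(x);
       v4sf s, c;
       sincos_ps(x, &s, &c);
       float out[4];
       _mm_storeu_ps(out, l);
       std::printf("log(1) = %f, log(2) = %f\n", out[0], out[1]);
       (void)e; (void)s; (void)c;
       return 0;
     }
*/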

#define USE_SSE2
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

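/* For illustration: on gcc/clang, where ALIGN16_BEG is empty and ALIGN16_END is
   the aligned(16) attribute, a line such as _PS_CONST(0p5, 0.5f) expands to

     static const float _ps_0p5[4] __attribute__((aligned(16))) = { 0.5f, 0.5f, 0.5f, 0.5f };

   and the constant is later loaded into a vector register with *(v4sf*)_ps_0p5. */
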
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, unsigned int, 0x00800000U);
_PS_CONST_TYPE(mant_mask, unsigned int, 0x7f800000U);
_PS_CONST_TYPE(inv_mant_mask, unsigned int, ~0x7f800000U);

_PS_CONST_TYPE(sign_mask, unsigned int, 0x80000000U);
_PS_CONST_TYPE(inv_sign_mask, unsigned int, ~0x80000000U);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
   The movehl_ps bug is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were
   completely broken on my mingw gcc 3.4.5...

   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level.
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpnleps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
  }

#endif // USE_SSE2
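
/* Illustrative sketch (not from the original source): the union above type-puns a
   128-bit SSE register into its two 64-bit MMX halves, which the macros copy out
   and back.  Assuming the SSE1+MMX path (i.e. USE_SSE2 not defined):

     __m128 v = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
     __m64 lo, hi;
     COPY_XMM_TO_MM(v, lo, hi);  // lo = raw bits of lanes 0-1, hi = lanes 2-3
     COPY_MM_TO_XMM(lo, hi, v);  // reassemble the vector
     _mm_empty();                // leave MMX state before using x87 code again

   In this file USE_SSE2 is forced on above, so this MMX path is normally compiled out. */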

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
inline v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
  // propagate NaN: the mask is also set when all exponent bits of x are set (Inf or NaN)
  invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps()));

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the actual base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty();  /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);  // negative args produce NaN (and NaN inputs propagate)
  return x;
}
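
/* Usage sketch (illustrative): compute four logarithms at once.

     float in[4] = {0.5f, 1.f, 2.f, 10.f}, out[4];
     _mm_storeu_ps(out, log_ps(_mm_loadu_ps(in)));
     // out[i] is now approximately logf(in[i]); lanes with in[i] <= 0 come back as NaN
*/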

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  // propagate NaN: the mask is set when all exponent bits of x are set (Inf or NaN)
  v4sf invalid_mask = _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps());


  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  y = _mm_or_ps(y, invalid_mask);  // propagate NaN
  return y;
}
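
/* Usage sketch (illustrative): exponentials of four values at once.  Inputs are
   clamped to roughly [-88.4, 88.4] above, so very large arguments saturate instead
   of overflowing to Inf.

     float in[4] = {-1.f, 0.f, 1.f, 10.f}, out[4];
     _mm_storeu_ps(out, exp_ps(_mm_loadu_ps(in)));
     // out[i] is now approximately expf(in[i])
*/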

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* Evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so it also runs
   on old Athlon XPs and the Pentium III of your grandmother.

   The code is an exact rewriting of the cephes sinf function. Precision is
   excellent as long as x < 8192 (I did not bother to take into account the
   special handling they have for larger values -- it does not return garbage
   for arguments over 8192, but the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the macos
   vsinf SSE2 function, and 1.5 times faster than the __vrs4_sinf of AMD's
   ACML (which is only available in 64 bits). Not too bad for an SSE1 function
   (with no special tuning)! However, the latter libraries probably have much
   better handling of NaN, Inf, denormalized and other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
inline v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2.

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty();  /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
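
/* Usage sketch (illustrative):

     v4sf angles = _mm_set_ps(3.14159265f, 1.57079633f, 0.78539816f, 0.f);
     float s[4];
     _mm_storeu_ps(s, sin_ps(angles));
     // s[0] ~ 0, s[1] ~ sin(pi/4) ~ 0.7071, s[2] ~ 1, s[3] ~ sin(pi) ~ 8.7e-8
*/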

/* almost the same as sin_ps */
inline v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty();  /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* Since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them...
   it is almost as fast, and gives you a free cosine with your sine. */
inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty();  /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
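
/* Usage sketch (illustrative): one call computes both sine and cosine, which is
   cheaper than calling sin_ps and cos_ps separately since the argument reduction
   is shared.

     v4sf angles = _mm_set_ps(1.5f, 1.0f, 0.5f, 0.f);
     v4sf s, c;
     sincos_ps(angles, &s, &c);
     // lane i of s and c now holds approximately sinf/cosf of lane i of angles
*/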

#endif // CMS_USE_SSE

#endif // SSE_MATHFUN_H