#if (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)) || defined(__clang__)
#include <emmintrin.h>
#include <pmmintrin.h>
#include <smmintrin.h>
#include <xmmintrin.h>

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
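/* For illustration: _PS_CONST(1, 1.0f) expands (on gcc, where ALIGN16_BEG
   is empty and ALIGN16_END carries the attribute) to

     static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };

   i.e. a 16-byte-aligned broadcast constant that the functions below reload
   into a register with *(v4sf*)_ps_1. */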
_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, unsigned int, 0x00800000U);
_PS_CONST_TYPE(mant_mask, unsigned int, 0x7f800000U);
_PS_CONST_TYPE(inv_mant_mask, unsigned int, ~0x7f800000U);

_PS_CONST_TYPE(sign_mask, unsigned int, 0x80000000U);
_PS_CONST_TYPE(inv_sign_mask, unsigned int, ~0x80000000U);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be buggy with respect
   to these intrinsics, so they are replaced with inline asm */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm ("movhlps %2,%0" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm ("cmpltps %2,%0" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm ("cmpnleps %2,%0" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm ("cmpeqps %2,%0" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_; \
    mm0_ = u.mm[0]; mm1_ = u.mm[1]; \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif // USE_SSE2
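/* The union above exists for the SSE1+MMX fallback: there is no cheap
   intrinsic to move data between one __m128 and a pair of __m64 registers,
   so the two COPY_* macros type-pun through a union instead. */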
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x < 0 and for Inf/NaN inputs */
inline v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
  /* also flag Inf/NaN inputs, whose exponent bits are all ones */
  invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps()));

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);
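  /* Why this works: after the masking above, x = m * 2^e with m in [0.5, 1),
     so log(x) = log(m) + e*log(2). The exponent e comes from shifting the
     IEEE-754 bits right by 23 and subtracting the bias 0x7f; log(2) itself
     is applied in two pieces below (cephes_log_q2 + cephes_log_q1
     = 0.693359375 - 2.12194440e-4 = 0.6931472) to keep extra precision. */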
  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);
  v4sf z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); /* negative arg will be NAN */
  return x;
}
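/* Usage sketch (illustrative, not part of the original API); assumes an
   SSE-capable build with this header included:

     float in[4] = { 1.f, 2.f, 4.f, 8.f }, out[4];
     _mm_storeu_ps(out, log_ps(_mm_loadu_ps(in)));
     // out is now approximately { 0.0f, 0.69315f, 1.38629f, 2.07944f }
*/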
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
/* exponential computed for 4 simultaneous floats */
inline v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* flag Inf/NaN inputs (exponent bits all ones) so they come out as NaN */
  v4sf invalid_mask = _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps());

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x, x);
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);
  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif

  y = _mm_mul_ps(y, pow2n);
  y = _mm_or_ps(y, invalid_mask); /* NaN/Inf inputs come out as NaN */
  return y;
}
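/* Usage sketch (illustrative):

     float in[4] = { 0.f, 1.f, -1.f, 10.f }, out[4];
     _mm_storeu_ps(out, exp_ps(_mm_loadu_ps(in)));
     // out is now approximately { 1.0f, 2.71828f, 0.36788f, 22026.47f }

   Inputs are clamped to [exp_lo, exp_hi] first, so very large arguments
   saturate instead of overflowing the final 2^n scaling. */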
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
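/* How the argument reduction in sin_ps/cos_ps/sincos_ps below works (from
   the cephes sources): j = (int)(x * 4/Pi) is rounded to an even quadrant
   count, the angle is reduced with
     x = ((x - j*DP1) - j*DP2) - j*DP3
   where DP1 + DP2 + DP3 = Pi/4 split across three floats for extra
   precision; then bit 1 of j selects the sine or cosine polynomial and
   bit 2, shifted into the float sign-bit position (<< 29), supplies the
   sign flip. */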
/* evaluation of 4 sines at once; the code is an exact rewriting of the
   cephes sinf function, using SSE1+MMX intrinsics (or SSE2 intrinsics
   when USE_SSE2 is defined) */
inline v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask: there is one polynom for
     0 <= x <= Pi/4 and another one for Pi/4 < x <= Pi/2;
     both branches will be computed */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
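/* Usage sketch (illustrative):

     float in[4] = { 0.f, 0.5235988f, 1.5707963f, 3.1415927f }, out[4];
     _mm_storeu_ps(out, sin_ps(_mm_loadu_ps(in)));
     // out is now approximately { 0.0f, 0.5f, 1.0f, 0.0f },
     // i.e. sin(0), sin(Pi/6), sin(Pi/2), sin(Pi)
*/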
/* almost the same as sin_ps */
inline v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
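/* Usage sketch (illustrative):

     float in[4] = { 0.f, 1.0471976f, 3.1415927f, -3.1415927f }, out[4];
     _mm_storeu_ps(out, cos_ps(_mm_loadu_ps(in)));
     // out is now approximately { 1.0f, 0.5f, -1.0f, -1.0f },
     // i.e. cos(0), cos(Pi/3), cos(Pi), cos(-Pi)
*/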
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace
   both of them: it is almost as fast, and gives you a free cosine with
   your sine */
inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  /* evaluate the first polynom (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x, x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the signs */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
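/* Usage sketch (illustrative): one call yields both results, at roughly
   the cost of a single sin_ps:

     float in[4] = { 0.f, 0.7853982f, 1.5707963f, 3.1415927f };
     float sret[4], cret[4];
     v4sf s, c;
     sincos_ps(_mm_loadu_ps(in), &s, &c);
     _mm_storeu_ps(sret, s);
     _mm_storeu_ps(cret, c);
     // sret is now approximately { 0.0f, 0.70711f, 1.0f, 0.0f }
     // cret is now approximately { 1.0f, 0.70711f, 0.0f, -1.0f }
*/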
#endif // SSE_MATHFUN_H