#ifndef SSE_MATHFUN_H
#define SSE_MATHFUN_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__)
#if (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)) || defined(__clang__)
#include <emmintrin.h>
#include <pmmintrin.h>
#include <smmintrin.h>
#include <xmmintrin.h>

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

typedef __m128 v4sf;  // vector of 4 floats (SSE1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 ints (SSE2)
#else
typedef __m64 v2si;   // vector of 2 ints (MMX)
#endif
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
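
/* Example: _PS_CONST(0p5, 0.5f) expands to
     static const ALIGN16_BEG float _ps_0p5[4] ALIGN16_END = { 0.5f, 0.5f, 0.5f, 0.5f };
   i.e. a 16-byte aligned array whose four lanes hold the same value, loaded below
   with expressions such as *(v4sf*)_ps_0p5. */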
_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, unsigned int, 0x00800000U);
_PS_CONST_TYPE(mant_mask, unsigned int, 0x7f800000U);
_PS_CONST_TYPE(inv_mant_mask, unsigned int, ~0x7f800000U);

_PS_CONST_TYPE(sign_mask, unsigned int, 0x80000000U);
_PS_CONST_TYPE(inv_sign_mask, unsigned int, ~0x80000000U);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#if defined (__MINGW32__)

/* some gcc versions shipped with MinGW had buggy versions of these intrinsics;
   provide inline-asm replacements */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpnleps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
      );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif /* __MINGW32__ */
/* union used to move data between an __m128 (xmm) value and two __m64 (mm) halves */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
  }
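
/* Illustration: COPY_XMM_TO_MM(x, mm0, mm1) copies the low 64 bits of the __m128
   value x into mm0 and the high 64 bits into mm1; COPY_MM_TO_XMM is the inverse.
   Both go through the union, so bit patterns are reinterpreted rather than converted. */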
/* natural logarithm computed for 4 simultaneous floats.
   Negative and Inf/NaN inputs yield NaN; zero and denormals are clamped
   to the smallest normalized float before taking the log. */
inline v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* flag negative inputs ... */
  v4sf invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
  /* ... and Inf/NaN inputs: (~x) & mant_mask == 0 exactly when every exponent bit of x is set */
  invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps()));

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized inputs */
#ifndef USE_SSE2
  /* part 1: extract the exponent, as in x = frexpf(x, &e) */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif

  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* mm0:mm1 now hold the biased base-2 exponent; remove the bias */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty();  /* done with MMX */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);
  /* part 2: if x < SQRT(0.5) then { e -= 1; x = 2*x - 1; } else { x = x - 1; } */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x, x);

  /* evaluate the polynomial (Cephes coefficients) */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);  /* force NaN on lanes flagged invalid */
  return x;
}
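
/* Example (illustrative): compute the natural log of four floats at once.
     ALIGN16_BEG float buf[4] ALIGN16_END = { 1.f, 2.f, 4.f, 8.f };
     v4sf r = log_ps(_mm_load_ps(buf));
     _mm_store_ps(buf, r);   // buf now holds ln(1), ln(2), ln(4), ln(8)
*/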
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
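
/* Note: cephes_exp_C1 + cephes_exp_C2 = 0.693359375 - 2.12194440e-4, which is ln(2)
   split into two parts so that n*ln(2) can be subtracted from x in two steps with
   little rounding error (Cephes-style extended-precision range reduction). */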
inline v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* flag Inf/NaN inputs (exponent field all ones) */
  v4sf invalid_mask = _mm_cmpeq_ps(_mm_andnot_ps(x, *(v4sf*)_ps_mant_mask), _mm_setzero_ps());

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
#ifndef USE_SSE2
  /* truncate fx lane by lane via MMX ... */
  tmp = _mm_setzero_ps();
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* ... and convert back to floats */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if truncation ended up above fx, subtract 1 to get floor(fx) */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);
  z = _mm_mul_ps(x, x);

  /* evaluate the exp polynomial */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);
  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif

  y = _mm_mul_ps(y, pow2n);
  y = _mm_or_ps(y, invalid_mask);  /* force NaN on lanes whose input was Inf/NaN */
  return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516);
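
/* Note: cephes_FOPI is 4/pi, and -(DP1 + DP2 + DP3) ~= pi/4; the three-part split lets
   y*pi/4 be removed from x in separate steps (extended-precision modular arithmetic),
   keeping the reduced argument accurate for the polynomials below. */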
/* sine computed for 4 simultaneous floats */
inline v4sf sin_ps(v4sf x) {
  v4sf xmm1, xmm2, xmm3, sign_bit, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j = (j+1) & (~1), see the cephes sources */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask: one polynomial for 0 <= x <= Pi/4
     and another for Pi/4 < x <= Pi/2; both branches will be computed */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_setzero_ps();
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j = (j+1) & (~1), see the cephes sources */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty();  /* done with MMX */
#endif
  /* the magic pass: "extended precision modular arithmetic"
     x = ((x - y*DP1) - y*DP2) - y*DP3 */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
  /* evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* evaluate the second polynomial (Pi/4 < x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
/* cosine computed for 4 simultaneous floats */
inline v4sf cos_ps(v4sf x) {
  v4sf xmm1, xmm2, xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j = (j+1) & (~1), see the cephes sources */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_setzero_ps();
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j = (j+1) & (~1), see the cephes sources */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty();  /* done with MMX */
#endif
  /* the magic pass: "extended precision modular arithmetic"
     x = ((x - y*DP1) - y*DP2) - y*DP3 */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
  /* evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* evaluate the second polynomial (Pi/4 < x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
/* computes the sine and cosine of the 4 floats at once */
inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3, sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j = (j+1) & (~1), see the cephes sources */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_setzero_ps();
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j = (j+1) & (~1), see the cephes sources */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif
  /* the magic pass: "extended precision modular arithmetic"
     x = ((x - y*DP1) - y*DP2) - y*DP3 */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty();  /* done with MMX */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  /* evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x, x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* evaluate the second polynomial (Pi/4 < x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
  /* select the correct results from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the signs */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
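
/* Example usage (illustrative sketch, assuming this header is included as
   "sse_mathfun.h" in an SSE-enabled build):

     #include <cstdio>
     int main() {
       ALIGN16_BEG float in[4] ALIGN16_END = { 0.f, 0.5f, 1.f, 2.f };
       ALIGN16_BEG float s[4] ALIGN16_END;
       ALIGN16_BEG float c[4] ALIGN16_END;
       v4sf vs, vc;
       sincos_ps(_mm_load_ps(in), &vs, &vc);   // sine and cosine of all 4 lanes at once
       _mm_store_ps(s, vs);
       _mm_store_ps(c, vc);
       for (int i = 0; i < 4; ++i)
         printf("sin(%g) = %g, cos(%g) = %g\n", in[i], s[i], in[i], c[i]);
       return 0;
     }
*/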
#endif // SSE_MATHFUN_H