Go to the documentation of this file.00001 #ifndef vdt_math_h
00002 #define vdt_math_h
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include <iostream>
00020 #include <cmath>
00021 #include <limits>
00022
00023
00024 #define CMS_VECTORIZE
00025
00026 #define CMS_VECTORIZE_VERBOSE
00027
00028 #define VDT_RESTRICT
00029
00030 #define VDT_FORCE_INLINE __attribute__((always_inline)) inline
00031
00032 namespace vdt {
00033
00034
00035 constexpr double LOG2E = 1.4426950408889634073599;
00036 constexpr double SQRTH = 0.70710678118654752440;
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046 constexpr double EXP_LIMIT = 708.;
00047 constexpr double PX1exp = 1.26177193074810590878E-4;
00048 constexpr double PX2exp = 3.02994407707441961300E-2;
00049 constexpr double PX3exp = 9.99999999999999999910E-1;
00050 constexpr double QX1exp = 3.00198505138664455042E-6;
00051 constexpr double QX2exp = 2.52448340349684104192E-3;
00052 constexpr double QX3exp = 2.27265548208155028766E-1;
00053 constexpr double QX4exp = 2.00000000000000000009E0;
00054
00055
00056 constexpr double LOG_UPPER_LIMIT = 1e307;
00057 constexpr double LOG_LOWER_LIMIT = 1e-307;
00058 constexpr double PX1log = 1.01875663804580931796E-4;
00059 constexpr double PX2log = 4.97494994976747001425E-1;
00060 constexpr double PX3log = 4.70579119878881725854E0;
00061 constexpr double PX4log = 1.44989225341610930846E1;
00062 constexpr double PX5log = 1.79368678507819816313E1;
00063 constexpr double PX6log = 7.70838733755885391666E0;
00064
00065 constexpr double QX1log = 1.12873587189167450590E1;
00066 constexpr double QX2log = 4.52279145837532221105E1;
00067 constexpr double QX3log = 8.29875266912776603211E1;
00068 constexpr double QX4log = 7.11544750618563894466E1;
00069 constexpr double QX5log = 2.31251620126765340583E1;
00070
00071
00072 constexpr double DP1sc = 7.85398125648498535156E-1;
00073 constexpr double DP2sc = 3.77489470793079817668E-8;
00074 constexpr double DP3sc = 2.69515142907905952645E-15;
00075 constexpr double TWOPI = 2.*M_PI;
00076 constexpr double PI = M_PI;
00077 constexpr double PIO2 = M_PI_2;
00078 constexpr double PIO4 = M_PI_4;
00079
00080
00081 constexpr double SIN_UPPER_LIMIT = TWOPI;
00082 constexpr double SIN_LOWER_LIMIT = -SIN_UPPER_LIMIT;
00083 constexpr double C1sin = 1.58962301576546568060E-10;
00084 constexpr double C2sin =-2.50507477628578072866E-8;
00085 constexpr double C3sin = 2.75573136213857245213E-6;
00086 constexpr double C4sin =-1.98412698295895385996E-4;
00087 constexpr double C5sin = 8.33333333332211858878E-3;
00088 constexpr double C6sin =-1.66666666666666307295E-1;
00089
00090
00091 constexpr double C1cos =-1.13585365213876817300E-11;
00092 constexpr double C2cos = 2.08757008419747316778E-9;
00093 constexpr double C3cos =-2.75573141792967388112E-7;
00094 constexpr double C4cos = 2.48015872888517045348E-5;
00095 constexpr double C5cos =-1.38888888888730564116E-3;
00096 constexpr double C6cos = 4.16666666666665929218E-2;
00097
00098
00099
00100 constexpr double RX1asin = 2.967721961301243206100E-3;
00101 constexpr double RX2asin = -5.634242780008963776856E-1;
00102 constexpr double RX3asin = 6.968710824104713396794E0;
00103 constexpr double RX4asin = -2.556901049652824852289E1;
00104 constexpr double RX5asin = 2.853665548261061424989E1;
00105
00106 constexpr double SX1asin = -2.194779531642920639778E1;
00107 constexpr double SX2asin = 1.470656354026814941758E2;
00108 constexpr double SX3asin = -3.838770957603691357202E2;
00109 constexpr double SX4asin = 3.424398657913078477438E2;
00110
00111 constexpr double PX1asin = 4.253011369004428248960E-3;
00112 constexpr double PX2asin = -6.019598008014123785661E-1;
00113 constexpr double PX3asin = 5.444622390564711410273E0;
00114 constexpr double PX4asin = -1.626247967210700244449E1;
00115 constexpr double PX5asin = 1.956261983317594739197E1;
00116 constexpr double PX6asin = -8.198089802484824371615E0;
00117
00118 constexpr double QX1asin = -1.474091372988853791896E1;
00119 constexpr double QX2asin = 7.049610280856842141659E1;
00120 constexpr double QX3asin = -1.471791292232726029859E2;
00121 constexpr double QX4asin = 1.395105614657485689735E2;
00122 constexpr double QX5asin = -4.918853881490881290097E1;
00123
00124
00125 constexpr double PX1tan=-1.30936939181383777646E4;
00126 constexpr double PX2tan=1.15351664838587416140E6;
00127 constexpr double PX3tan=-1.79565251976484877988E7;
00128
00129 constexpr double QX1tan = 1.36812963470692954678E4;
00130 constexpr double QX2tan = -1.32089234440210967447E6;
00131 constexpr double QX3tan = 2.50083801823357915839E7;
00132 constexpr double QX4tan = -5.38695755929454629881E7;
00133
00134 constexpr double DP1tan = 7.853981554508209228515625E-1;
00135 constexpr double DP2tan = 7.94662735614792836714E-9;
00136 constexpr double DP3tan = 3.06161699786838294307E-17;
00137 constexpr double TAN_LIMIT = TWOPI;
00138
00139
00140 constexpr double T3PO8 = 2.41421356237309504880;
00141 constexpr double MOREBITS = 6.123233995736765886130E-17;
00142 constexpr double MOREBITSO2 = MOREBITS/2.;
00143
00144 constexpr double PX1atan = -8.750608600031904122785E-1;
00145 constexpr double PX2atan = -1.615753718733365076637E1;
00146 constexpr double PX3atan = -7.500855792314704667340E1;
00147 constexpr double PX4atan = -1.228866684490136173410E2;
00148 constexpr double PX5atan = -6.485021904942025371773E1;
00149
00150 constexpr double QX1atan = - 2.485846490142306297962E1;
00151 constexpr double QX2atan = 1.650270098316988542046E2;
00152 constexpr double QX3atan = 4.328810604912902668951E2;
00153 constexpr double QX4atan = 4.853903996359136964868E2;
00154 constexpr double QX5atan = 1.945506571482613964425E2;
00155
00156 constexpr double ATAN_LIMIT = 1e307;
00157
00158
00159
00160 constexpr double SQRT_LIMIT = 1e307;
00161
00162
00164 void print_instructions_info();
00165
00166
00167
00169 typedef union {
00170 double d;
00171 int i[2];
00172 long long ll;
00173 unsigned short s[4];
00174 } ieee754;
00175
00176
00177
00179 VDT_FORCE_INLINE double ll2d(unsigned long long x) {
00180 ieee754 tmp;
00181 tmp.ll=x;
00182 return tmp.d;
00183 }
00184
00185
00186
00188 VDT_FORCE_INLINE unsigned long long d2ll(double x) {
00189 ieee754 tmp;
00190 tmp.d=x;
00191 return tmp.ll;
00192 }
00193
00194
00196 VDT_FORCE_INLINE double getMantExponent(double x, double& fe){
00197
00198 unsigned long long n = d2ll(x);
00199
00200
00201
00202 unsigned long long le = ((n >> 52) & 0x7ffLL);
00203
00204
00205 int e = le;
00206 fe = (e-1023) +1 ;
00207
00208
00209
00210 n &=0xfffffffffffffLL;
00211
00212
00213
00214 const unsigned long long p05 = d2ll(0.5);
00215 n |= p05;
00216 x = ll2d(n);
00217 return x;
00218 }
00219
00220
00221
00222
00223
00224
00226 VDT_FORCE_INLINE double fast_exp(double x){
00227
00228 double initial_x = x;
00229
00230
00231 double px = std::floor(LOG2E * x + 0.5);
00232
00233 int n = px;
00234
00235 x -= px * 6.93145751953125E-1;
00236 x -= px * 1.42860682030941723212E-6;
00237
00238 double xx = x * x;
00239
00240
00241 px = PX1exp;
00242 px *= xx;
00243 px += PX2exp;
00244 px *= xx;
00245 px += PX3exp;
00246 px *= x;
00247
00248
00249 double qx = QX1exp;
00250 qx *= xx;
00251 qx += QX2exp;
00252 qx *= xx;
00253 qx += QX3exp;
00254 qx *= xx;
00255 qx += QX4exp;
00256
00257
00258 x = px / (qx - px);
00259 x = 1.0 + 2.0 * x;
00260
00261
00262 ieee754 u;
00263 u.d = 0;
00264 n += 1023;
00265 u.ll = (long long) (n) << 52;
00266
00267 double res = x * u.d;
00268 if (initial_x > EXP_LIMIT)
00269 res = std::numeric_limits<double>::infinity();
00270 if (initial_x < -EXP_LIMIT)
00271 res = 0.;
00272
00273 return res;
00274 }
00275
00276
00277
00278
00279 VDT_FORCE_INLINE double fast_log(double x){
00280
00281 double input_x=x;
00282
00283
00284 double fe;
00285 x = getMantExponent(x,fe);
00286
00287
00288 if( x < SQRTH ) {
00289 fe-=1;
00290 x += x ;
00291 }
00292 x -= 1.0;
00293
00294
00295
00296 double z = x*x;
00297 double px = PX1log;
00298 px *= x;
00299 px += PX2log;
00300 px *= x;
00301 px += PX3log;
00302 px *= x;
00303 px += PX4log;
00304 px *= x;
00305 px += PX5log;
00306 px *= x;
00307 px += PX6log;
00308
00309
00310 px *= x;
00311 px *= z;
00312
00313 double qx = x;
00314 qx += QX1log;
00315 qx *=x;
00316 qx += QX2log;
00317 qx *=x;
00318 qx += QX3log;
00319 qx *=x;
00320 qx += QX4log;
00321 qx *=x;
00322 qx += QX5log;
00323
00324 double y = px / qx ;
00325
00326 y -= fe * 2.121944400546905827679e-4;
00327 y -= 0.5 * z ;
00328
00329 z = x + y;
00330 z += fe * 0.693359375;
00331
00332 if (input_x > LOG_UPPER_LIMIT)
00333 z = std::numeric_limits<double>::infinity();
00334 if (input_x < LOG_LOWER_LIMIT)
00335 z = - std::numeric_limits<double>::infinity();
00336
00337
00338
00339 return( z );
00340
00341 }
00342
00343
00345 VDT_FORCE_INLINE double fast_sin(double x){
00346
00347 int sign = 1;
00348
00349 if (x < 0){
00350 x = - x;
00351 sign = -1;
00352 }
00353
00354 if( x > PI ){
00355 x = TWOPI - x;
00356 sign = - sign;
00357 }
00358
00359 if( x > PIO2 )
00360 x = PI - x ;
00361
00362 double y = int( x/PIO4 );
00363
00364 int j=0;
00365 if (x>PIO4){
00366 j=2;
00367 y+=1;
00368 }
00369
00370
00371 double z = ((x - y * DP1sc) - y * DP2sc) - y * DP3sc;
00372
00373 double zz = z * z;
00374
00375 double px=0;
00376
00377 if( j==2 ){
00378 px = C1cos;
00379 px *= zz;
00380 px += C2cos;
00381 px *= zz;
00382 px += C3cos;
00383 px *= zz;
00384 px += C4cos;
00385 px *= zz;
00386 px += C5cos;
00387 px *= zz;
00388 px += C6cos;
00389 y = 1.0 - zz * .5 + zz * zz * px;
00390 }
00391 else{
00392 px = C1sin;
00393 px *= zz;
00394 px += C2sin;
00395 px *= zz;
00396 px += C3sin;
00397 px *= zz;
00398 px += C4sin;
00399 px *= zz;
00400 px += C5sin;
00401 px *= zz;
00402 px += C6sin;
00403 y = z + z * zz * px;
00404 }
00405
00406 y *= sign;
00407
00408 return y;
00409 }
00410
00411
00412
00413 VDT_FORCE_INLINE double fast_asin(double x){
00414
00415 int sign=1;
00416 double a = x;
00417
00418 if ( x < 0. ){
00419 sign *= -1;
00420 a *= -1;
00421 }
00422
00423 double p, z, zz;
00424 double px,qx;
00425
00426
00427 zz = 1.0 - a;
00428
00429 px = RX1asin;
00430 px*= zz;
00431 px+= RX2asin;
00432 px*= zz;
00433 px+= RX3asin;
00434 px*= zz;
00435 px+= RX4asin;
00436 px*= zz;
00437 px+= RX5asin;
00438
00439 qx = zz;
00440 qx+= SX1asin;
00441 qx*= zz;
00442 qx+= SX2asin;
00443 qx*= zz;
00444 qx+= SX3asin;
00445 qx*= zz;
00446 qx+= SX4asin;
00447
00448 p =zz* px/qx;
00449
00450
00451
00452 zz = std::sqrt(zz+zz);
00453 z = PIO4 - zz;
00454 zz = zz * p - MOREBITS;
00455 z -= zz;
00456 z += PIO4;
00457
00458 if( a < 0.625 ){
00459 zz = a * a;
00460 px = PX1asin;
00461 px*= zz;
00462 px+= PX2asin;
00463 px*= zz;
00464 px+= PX3asin;
00465 px*= zz;
00466 px+= PX4asin;
00467 px*= zz;
00468 px+= PX5asin;
00469 px*= zz;
00470 px+= PX6asin;
00471
00472 qx = zz;
00473 qx+= QX1asin;
00474 qx*= zz;
00475 qx+= QX2asin;
00476 qx*= zz;
00477 qx+= QX3asin;
00478 qx*= zz;
00479 qx+= QX4asin;
00480 qx*= zz;
00481 qx+= QX5asin;
00482
00483 z = zz*px/qx;
00484
00485 z = a * z + a;
00486 }
00487
00488 z *= sign;
00489
00490
00491 if( a < 1.0e-8 )
00492 z = a;
00493
00494 return z;
00495 }
00496
00497
00498
00499
00501 VDT_FORCE_INLINE double fast_cos(double x){
00502
00503 x = std::abs(x);
00504
00505 if( x > PI )
00506 x = TWOPI - x ;
00507
00508 int sign = 1;
00509 if( x > PIO2 ){
00510 x = PI - x;
00511 sign=-1;
00512 }
00513
00514 double y = int( x/PIO4 );
00515
00516 int j=0;
00517 if (x>PIO4){
00518 j=2;
00519 y+=1;
00520 sign = -sign;
00521 }
00522
00523
00524 double z = ((x - y * DP1sc) - y * DP2sc) - y * DP3sc;
00525
00526 double zz = z * z;
00527
00528 double px=0;
00529 if( j==2 ){
00530 px = C1sin;
00531 px *= zz;
00532 px += C2sin;
00533 px *= zz;
00534 px += C3sin;
00535 px *= zz;
00536 px += C4sin;
00537 px *= zz;
00538 px += C5sin;
00539 px *= zz;
00540 px += C6sin;
00541 y = z + z * zz * px;
00542 }
00543 else{
00544 px = C1cos;
00545 px *= zz;
00546 px += C2cos;
00547 px *= zz;
00548 px += C3cos;
00549 px *= zz;
00550 px += C4cos;
00551 px *= zz;
00552 px += C5cos;
00553 px *= zz;
00554 px += C6cos;
00555 y = 1. - zz * .5 + zz * zz * px;
00556 }
00557
00558 y *= sign;
00559
00560 return y;
00561 }
00562
00563
00564 VDT_FORCE_INLINE double fast_acos(double x){
00565 double z;
00566
00567
00568
00569
00570
00571
00572 z = 2.0 * fast_asin( sqrt(0.5 - 0.5 * x ) ) ;
00573
00574 return z;
00575 }
00576
00577
00579 VDT_FORCE_INLINE double fast_tan( double x ){
00580
00581
00582
00583
00584
00585
00586
00587
00588 double abs_x =std::abs(x);
00589 int sign = x/abs_x;
00590 x = abs_x;
00591
00592
00593
00594
00595
00596 int nPI = x /PI;
00597 x = x - nPI * PI;
00598
00599
00600
00601
00602
00603
00604
00605
00606 int nPIO2 = x/PIO2;
00607 int factor = ( 1 - 2* nPIO2);
00608 x = nPIO2* PI + factor * x;
00609 sign *= factor;
00610
00611
00612
00613 int nPIO4 = x/PIO4;
00614 double y = 2 * nPIO4;
00615
00616
00617
00618
00619
00620
00621
00622
00623
00624
00625
00626 double z = ((x - y * DP1tan) - y * DP2tan) - y * DP3tan;
00627
00628 double zz = z * z;
00629
00630 y=z;
00631
00632 if( zz > 1.0e-14 ){
00633 double px = PX1tan;
00634 px *= zz;
00635 px += PX2tan;
00636 px *= zz;
00637 px += PX3tan;
00638
00639 double qx=zz;
00640 qx += QX1tan;
00641 qx *=zz;
00642 qx += QX2tan;
00643 qx *=zz;
00644 qx += QX3tan;
00645 qx *=zz;
00646 qx += QX4tan;
00647
00648 y = z + z * zz * px / qx;
00649 }
00650
00651
00652
00653
00654
00655 y -= nPIO4 * ( y + 1.0 / y);
00656
00657 y *= sign;
00658
00659 return y ;
00660 }
00661
00662
00663
00664 VDT_FORCE_INLINE double fast_atan(double x){
00665
00666
00667 int sign = 1;
00668 if( x < 0.0 ) {
00669 x = - x;
00670 sign = -1;
00671 }
00672
00673
00674 double originalx=x;
00675
00676
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691 double y = PIO4;
00692 double factor = MOREBITSO2;
00693 x = (x-1.0) / (x+1.0);
00694
00695 if( originalx > T3PO8 ) {
00696 y = PIO2;
00697
00698 factor = MOREBITS;
00699 x = -1.0 / originalx ;
00700 }
00701 if ( originalx <= 0.66 ) {
00702 y = 0.0;
00703 x = originalx;
00704
00705 factor = 0.;
00706 }
00707
00708 double z = x * x;
00709
00710 double px = PX1atan;
00711 px *= z;
00712 px += PX2atan;
00713 px *= z;
00714 px += PX3atan;
00715 px *= z;
00716 px += PX4atan;
00717 px *= z;
00718 px += PX5atan;
00719 px *= z;
00720
00721 double qx=z;
00722 qx += QX1atan;
00723 qx *=z;
00724 qx += QX2atan;
00725 qx *=z;
00726 qx += QX3atan;
00727 qx *=z;
00728 qx += QX4atan;
00729 qx *=z;
00730 qx += QX5atan;
00731
00732
00733
00734
00735 y = y +x * px / qx + x +factor;
00736
00737 y = sign * y;
00738
00739 return y;
00740 }
00741
00742
00743
00744
00745
00746 VDT_FORCE_INLINE double fast_isqrt_general(double x, const unsigned short ISQRT_ITERATIONS) {
00747
00748 double x2 = x * 0.5;
00749 double y = x;
00750 unsigned long long i = d2ll(y);
00751
00752 i = 0x5fe6eb50c7aa19f9 - ( i >> 1 );
00753 y = ll2d(i);
00754 for (unsigned int j=0;j<ISQRT_ITERATIONS;++j)
00755 y *= 1.5 - ( x2 * y * y ) ;
00756
00757 return y;
00758 }
00759
00760
00761
00762
00763
00764
00765 VDT_FORCE_INLINE double fast_isqrt(double x) {return fast_isqrt_general(x,4);}
00766
00767
00768 VDT_FORCE_INLINE double fast_approx_isqrt(double x) {return fast_isqrt_general(x,3);}
00769
00770
00771
00772 VDT_FORCE_INLINE double std_isqrt (double x) {return 1./std::sqrt(x);}
00773
00774
00775
00776 VDT_FORCE_INLINE double fast_inv (double x) {
00777 double sign = 1;
00778 if( x < 0.0 ) {
00779 x = - x;
00780 sign = -1;
00781 }
00782 double y=fast_isqrt(x);
00783 return y*y*sign;
00784 }
00785
00786 VDT_FORCE_INLINE double fast_approx_inv (double x) {
00787 double sign = 1;
00788 if( x < 0.0 ) {
00789 x = - x;
00790 sign = -1;
00791 }
00792 double y=fast_approx_isqrt(x);
00793 return y*y*sign;}
00794
00795 VDT_FORCE_INLINE double std_inv (double x) {return 1./x;}
00796
00797
00798
00799
00800
00801
00802 #define FAST_VECT_FUNC(NAME) __attribute__((always_inline)) inline void NAME##_vect(double const * VDT_RESTRICT input , double * VDT_RESTRICT outupt, const unsigned int arr_size) { \
00803 for (unsigned int i=0;i<arr_size;++i) \
00804 outupt[i] = NAME ( input[i] ) CMS_VECTORIZE_VERBOSE; \
00805 }
00806
00808 void fast_exp_vect_46(double const* input, double* output, const unsigned int arr_size);
00809
00810
00811 FAST_VECT_FUNC(fast_exp)
00812
00813
00814 void fast_log_vect_46(double const* input, double* output, const unsigned int arr_size);
00815
00816
00817 FAST_VECT_FUNC(fast_log)
00818
00819
00820 FAST_VECT_FUNC(fast_sin)
00821
00822
00823 FAST_VECT_FUNC(fast_asin)
00824
00825
00826 FAST_VECT_FUNC(fast_cos)
00827
00828
00829 FAST_VECT_FUNC(fast_acos)
00830
00831
00832 FAST_VECT_FUNC(fast_tan)
00833
00834
00835 FAST_VECT_FUNC(fast_atan)
00836
00837
00838 FAST_VECT_FUNC(fast_isqrt)
00839
00840
00841 FAST_VECT_FUNC(fast_approx_isqrt)
00842
00843
00844 FAST_VECT_FUNC(fast_inv)
00845
00846
00847 FAST_VECT_FUNC(fast_approx_inv)
00848
00849
00850
00851 #define VECT_FUNC(NAME) __attribute__((always_inline)) inline void std_##NAME##_vect(double const * VDT_RESTRICT input , double* VDT_RESTRICT outupt, const unsigned int arr_size) { \
00852 for (unsigned int i=0;i<arr_size;++i) \
00853 outupt[i] = std::NAME ( input[i] ) CMS_VECTORIZE_VERBOSE; \
00854 }
00855
00856 VECT_FUNC(exp)
00857
00858 VECT_FUNC(log)
00859
00860 VECT_FUNC(sin)
00861
00862 VECT_FUNC(asin)
00863
00864 VECT_FUNC(cos)
00865
00866 VECT_FUNC(acos)
00867
00868 VECT_FUNC(tan)
00869
00870 VECT_FUNC(atan)
00871
00872 VDT_FORCE_INLINE void std_isqrt_vect(double const * VDT_RESTRICT input ,
00873 double* VDT_RESTRICT output,
00874 const unsigned int arr_size) CMS_VECTORIZE_VERBOSE{
00875
00876 for (unsigned int i=0;i<arr_size;++i)
00877 output[i] = vdt::std_isqrt(input[i]);
00878 }
00879
00880 VDT_FORCE_INLINE void std_inv_vect(double const * VDT_RESTRICT input ,
00881 double* VDT_RESTRICT output,
00882 const unsigned int arr_size) CMS_VECTORIZE_VERBOSE{
00883
00884 for (unsigned int i=0;i<arr_size;++i)
00885 output[i] = vdt::std_inv(input[i]);
00886 }
00887
00888
00889
00890
00891 }
00892
00893 #endif
00894