CMS 3D CMS Logo

/afs/cern.ch/work/a/aaltunda/public/www/CMSSW_5_3_14/src/DataFormats/Math/interface/VDTMath.h

Go to the documentation of this file.
00001 #ifndef vdt_math_h
00002 #define vdt_math_h
00003 
00004 /*******************************************************************************
00005  *
00006  * VDT math library: collection of double precision vectorisable trascendental
00007  * functions.
00008  * The c++11 standard is used: remember to enable it for the compilation.
00009  *
00010  * The basic idea is to exploit pade polinomials.
00011  * A lot of ideas were inspired by the cephes math library (by Stephen L. Moshier
00012  * moshier@na-net.ornl.gov) as well as actual code for the exp, log, sin, cos, 
00013  * tan, asin, acos and atan functions. The Cephes library can be found here:
00014  * http://www.netlib.org/cephes/
00015  * 
00016  ******************************************************************************/
00017 
00018 
00019 #include <iostream>
00020 #include <cmath>
00021 #include <limits>
00022 
00023 // used to enable some compiler option internally if needed
00024 #define CMS_VECTORIZE
00025 //#define CMS_VECTORIZE_VERBOSE __attribute__ ((optimize("-ftree-vectorizer-verbose=7")))
00026 #define CMS_VECTORIZE_VERBOSE
00027 //#define VDT_RESTRICT __restrict__
00028 #define VDT_RESTRICT
00029 
00030 #define VDT_FORCE_INLINE __attribute__((always_inline)) inline 
00031 
00032 namespace vdt {
00033 
00034 // paramters
00035 constexpr double LOG2E = 1.4426950408889634073599; // 1/log(2)
00036 constexpr double SQRTH = 0.70710678118654752440;
00037 
00038 /*
00039 Pade' polinomials coefficients and constants needed throughout the code.
00040 The result of constexpr *here* is likely to be the same of const static
00041 nevertheless we show its power: all the computations will take place at compile 
00042 time
00043 */
00044 
00045 // exp
00046 constexpr double EXP_LIMIT = 708.;
00047 constexpr double PX1exp = 1.26177193074810590878E-4;
00048 constexpr double PX2exp = 3.02994407707441961300E-2;
00049 constexpr double PX3exp = 9.99999999999999999910E-1;
00050 constexpr double QX1exp = 3.00198505138664455042E-6;
00051 constexpr double QX2exp = 2.52448340349684104192E-3;
00052 constexpr double QX3exp = 2.27265548208155028766E-1;
00053 constexpr double QX4exp = 2.00000000000000000009E0;
00054 
00055 // logarithm
00056 constexpr double LOG_UPPER_LIMIT = 1e307;
00057 constexpr double LOG_LOWER_LIMIT = 1e-307;
00058 constexpr double PX1log = 1.01875663804580931796E-4;
00059 constexpr double PX2log = 4.97494994976747001425E-1;
00060 constexpr double PX3log = 4.70579119878881725854E0;
00061 constexpr double PX4log = 1.44989225341610930846E1;
00062 constexpr double PX5log = 1.79368678507819816313E1;
00063 constexpr double PX6log = 7.70838733755885391666E0;
00064 
00065 constexpr double QX1log = 1.12873587189167450590E1;
00066 constexpr double QX2log = 4.52279145837532221105E1;
00067 constexpr double QX3log = 8.29875266912776603211E1;
00068 constexpr double QX4log = 7.11544750618563894466E1;
00069 constexpr double QX5log = 2.31251620126765340583E1;
00070 
00071 // Sin and cos
00072 constexpr double DP1sc = 7.85398125648498535156E-1;
00073 constexpr double DP2sc = 3.77489470793079817668E-8;
00074 constexpr double DP3sc = 2.69515142907905952645E-15;
00075 constexpr double TWOPI = 2.*M_PI;
00076 constexpr double PI = M_PI;
00077 constexpr double PIO2 = M_PI_2;
00078 constexpr double PIO4 = M_PI_4;
00079 
00080 // Sin
00081 constexpr double SIN_UPPER_LIMIT = TWOPI;
00082 constexpr double SIN_LOWER_LIMIT = -SIN_UPPER_LIMIT;
00083 constexpr double C1sin = 1.58962301576546568060E-10;
00084 constexpr double C2sin =-2.50507477628578072866E-8;
00085 constexpr double C3sin = 2.75573136213857245213E-6;
00086 constexpr double C4sin =-1.98412698295895385996E-4;
00087 constexpr double C5sin = 8.33333333332211858878E-3;
00088 constexpr double C6sin =-1.66666666666666307295E-1;
00089 
00090 //Cos
00091 constexpr double C1cos =-1.13585365213876817300E-11;
00092 constexpr double C2cos = 2.08757008419747316778E-9;
00093 constexpr double C3cos =-2.75573141792967388112E-7;
00094 constexpr double C4cos = 2.48015872888517045348E-5;
00095 constexpr double C5cos =-1.38888888888730564116E-3;
00096 constexpr double C6cos = 4.16666666666665929218E-2;
00097 
00098 // Asin and acos
00099 
00100 constexpr double RX1asin = 2.967721961301243206100E-3;
00101 constexpr double RX2asin = -5.634242780008963776856E-1;
00102 constexpr double RX3asin = 6.968710824104713396794E0;
00103 constexpr double RX4asin = -2.556901049652824852289E1;
00104 constexpr double RX5asin = 2.853665548261061424989E1;
00105 
00106 constexpr double SX1asin = -2.194779531642920639778E1;
00107 constexpr double SX2asin =  1.470656354026814941758E2;
00108 constexpr double SX3asin = -3.838770957603691357202E2;
00109 constexpr double SX4asin = 3.424398657913078477438E2;
00110 
00111 constexpr double PX1asin = 4.253011369004428248960E-3;
00112 constexpr double PX2asin = -6.019598008014123785661E-1;
00113 constexpr double PX3asin = 5.444622390564711410273E0;
00114 constexpr double PX4asin = -1.626247967210700244449E1;
00115 constexpr double PX5asin = 1.956261983317594739197E1;
00116 constexpr double PX6asin = -8.198089802484824371615E0;
00117 
00118 constexpr double QX1asin = -1.474091372988853791896E1;
00119 constexpr double QX2asin =  7.049610280856842141659E1;
00120 constexpr double QX3asin = -1.471791292232726029859E2;
00121 constexpr double QX4asin = 1.395105614657485689735E2;
00122 constexpr double QX5asin = -4.918853881490881290097E1;
00123 
00124 //Tan
00125 constexpr double PX1tan=-1.30936939181383777646E4;
00126 constexpr double PX2tan=1.15351664838587416140E6;
00127 constexpr double PX3tan=-1.79565251976484877988E7;
00128 
00129 constexpr double QX1tan = 1.36812963470692954678E4;
00130 constexpr double QX2tan = -1.32089234440210967447E6;
00131 constexpr double QX3tan = 2.50083801823357915839E7;
00132 constexpr double QX4tan = -5.38695755929454629881E7;
00133 
00134 constexpr double DP1tan = 7.853981554508209228515625E-1;
00135 constexpr double DP2tan = 7.94662735614792836714E-9;
00136 constexpr double DP3tan = 3.06161699786838294307E-17;
00137 constexpr double TAN_LIMIT = TWOPI;
00138 
00139 // Atan
00140 constexpr double T3PO8 = 2.41421356237309504880;
00141 constexpr double MOREBITS = 6.123233995736765886130E-17;
00142 constexpr double MOREBITSO2 = MOREBITS/2.;
00143 
00144 constexpr double PX1atan = -8.750608600031904122785E-1;
00145 constexpr double PX2atan = -1.615753718733365076637E1;
00146 constexpr double PX3atan = -7.500855792314704667340E1;
00147 constexpr double PX4atan = -1.228866684490136173410E2;
00148 constexpr double PX5atan = -6.485021904942025371773E1;
00149 
00150 constexpr double QX1atan = - 2.485846490142306297962E1;
00151 constexpr double QX2atan = 1.650270098316988542046E2;
00152 constexpr double QX3atan = 4.328810604912902668951E2;
00153 constexpr double QX4atan = 4.853903996359136964868E2;
00154 constexpr double QX5atan = 1.945506571482613964425E2;
00155 
00156 constexpr double ATAN_LIMIT = 1e307;
00157 
00158 // Inverse Sqrt
00159 // constexpr unsigned int ISQRT_ITERATIONS = 4;
00160 constexpr double SQRT_LIMIT = 1e307;
00161 
00162 // Service----------------------------------------------------------------------
00164 void print_instructions_info();
00165 
00166 //------------------------------------------------------------------------------
00167 
00169 typedef union {
00170   double d;
00171   int i[2];
00172   long long ll;
00173   unsigned short s[4];
00174 } ieee754;
00175 
00176 //------------------------------------------------------------------------------
00177 
00179 VDT_FORCE_INLINE double ll2d(unsigned long long x) {
00180   ieee754 tmp;
00181   tmp.ll=x;
00182   return tmp.d;
00183 }
00184 
00185 //------------------------------------------------------------------------------
00186 
00188 VDT_FORCE_INLINE unsigned long long d2ll(double x) {
00189   ieee754 tmp;
00190   tmp.d=x;
00191   return tmp.ll;
00192 }
00193 
00194 //------------------------------------------------------------------------------
00196 VDT_FORCE_INLINE double getMantExponent(double x, double& fe){
00197   
00198   unsigned long long n = d2ll(x);
00199   
00200   // shift to the right up to the beginning of the exponent 
00201   // then with a mask, cut off the sign bit
00202   unsigned long long le = ((n >> 52) & 0x7ffLL);
00203   
00204   // chop the head of the number: an int contains more than 11 bits (32)
00205   int e = le; // This is important since sums on ull do not vectorise
00206   fe = (e-1023) +1 ; // the plus one to make the result identical to frexp
00207   
00208   // 13 times f means 52 1. Masking with this means putting to 0 exponent
00209   // and sign of a double, leaving the Mantissa, the first 52 bits of a double.
00210   n &=0xfffffffffffffLL;
00211   
00212   // build a mask which is 0.5, i.e. an exponent equal to 1022
00213   // which means *2, see the above +1.
00214   const unsigned long long p05 = d2ll(0.5);
00215   n |= p05;
00216   x = ll2d(n);
00217   return x;
00218 }
00219 
00220 //------------------------------------------------------------------------------
00221 // Now the mathematical functions are encoded.
00222 
00223 // Exp -------------------------------------------------------------------------
00224 // Vectorises in a loop without any change in 4.7
00226 VDT_FORCE_INLINE double fast_exp(double x){
00227 
00228     double initial_x = x;
00229 
00230 //    double px =int(LOG2E * x + 0.5); // std::floor(LOG2E * x + 0.5);
00231     double px = std::floor(LOG2E * x + 0.5);
00232 
00233     int n = px;
00234 
00235     x -= px * 6.93145751953125E-1;
00236     x -= px * 1.42860682030941723212E-6;
00237 
00238     double xx = x * x;
00239 
00240     // px = x * P(x**2).
00241     px = PX1exp;
00242     px *= xx;
00243     px += PX2exp;
00244     px *= xx;
00245     px += PX3exp;
00246     px *= x;
00247 
00248     // Evaluate Q(x**2).
00249     double qx = QX1exp;
00250     qx *= xx;
00251     qx += QX2exp;
00252     qx *= xx;
00253     qx += QX3exp;
00254     qx *= xx;
00255     qx += QX4exp;
00256 
00257     // e**x = 1 + 2x P(x**2)/( Q(x**2) - P(x**2) )
00258     x = px / (qx - px);
00259     x = 1.0 + 2.0 * x;
00260 
00261     // Build 2^n in double.
00262     ieee754 u;
00263     u.d = 0;
00264     n += 1023;
00265     u.ll = (long long) (n) << 52;
00266 
00267     double res = x * u.d;
00268     if (initial_x > EXP_LIMIT)
00269             res = std::numeric_limits<double>::infinity();
00270     if (initial_x < -EXP_LIMIT)
00271             res = 0.;   
00272     
00273     return res;
00274 }
00275 
00276 
00277 // Log -------------------------------------------------------------------------
00278 
00279 VDT_FORCE_INLINE double fast_log(double x){
00280 
00281     double input_x=x;
00282 
00283     /* separate mantissa from exponent */    
00284     double fe;
00285     x = getMantExponent(x,fe);
00286 
00287     // blending      
00288     if( x < SQRTH ) {
00289       fe-=1;
00290       x +=  x ;
00291       }
00292     x -= 1.0;
00293 
00294     /* rational form */
00295 
00296     double z = x*x;
00297     double px =  PX1log;
00298     px *= x;    
00299     px += PX2log;
00300     px *= x;    
00301     px += PX3log;
00302     px *= x; 
00303     px += PX4log;
00304     px *= x; 
00305     px += PX5log;
00306     px *= x;
00307     px += PX6log;
00308     //
00309     //for the final formula
00310     px *= x; 
00311     px *= z;
00312 
00313     double qx = x;
00314     qx += QX1log;
00315     qx *=x;
00316     qx += QX2log;
00317     qx *=x;    
00318     qx += QX3log;
00319     qx *=x;    
00320     qx += QX4log;
00321     qx *=x;    
00322     qx += QX5log;
00323 
00324     double y = px / qx ;
00325 
00326     y -= fe * 2.121944400546905827679e-4; 
00327     y -= 0.5 * z  ;
00328 
00329     z = x + y;
00330     z += fe * 0.693359375;
00331 
00332     if (input_x > LOG_UPPER_LIMIT)
00333       z = std::numeric_limits<double>::infinity();
00334     if (input_x < LOG_LOWER_LIMIT)
00335       z =  - std::numeric_limits<double>::infinity();       
00336 
00337 //     std::cout << input_x << " " << std::log(input_x) << " " << z << std::endl;
00338 
00339     return( z );  
00340   
00341   }
00342 
00343 //------------------------------------------------------------------------------
00345 VDT_FORCE_INLINE double fast_sin(double x){
00346 
00347   int sign = 1;
00348 
00349   if (x < 0){
00350     x = - x;
00351     sign = -1;
00352     }
00353 
00354   if( x > PI ){
00355     x = TWOPI - x;
00356     sign = - sign;
00357     }
00358 
00359   if( x > PIO2 )
00360     x = PI - x ;
00361 
00362   double y = int( x/PIO4 ); // integer part of x/PIO4 
00363 
00364   int j=0;
00365   if (x>PIO4){
00366      j=2;
00367      y+=1;
00368      }
00369 
00370   /* Extended precision modular arithmetic */
00371   double z = ((x - y * DP1sc) - y * DP2sc) - y * DP3sc;
00372 
00373   double zz = z * z;
00374 
00375   double px=0;
00376 
00377   if( j==2 ){
00378     px  = C1cos;
00379     px *= zz;
00380     px += C2cos;
00381     px *= zz;
00382     px += C3cos;
00383     px *= zz;
00384     px += C4cos;
00385     px *= zz;
00386     px += C5cos;
00387     px *= zz;
00388     px += C6cos;
00389     y = 1.0 - zz * .5 + zz * zz * px;
00390     }
00391   else{
00392     px  = C1sin;
00393     px *= zz;
00394     px += C2sin;
00395     px *= zz;
00396     px += C3sin;
00397     px *= zz;
00398     px += C4sin;
00399     px *= zz;
00400     px += C5sin;
00401     px *= zz;
00402     px += C6sin;
00403     y = z  +  z * zz * px;
00404     }
00405 
00406   y *= sign;
00407 
00408   return y;
00409   }
00410 
00411 //------------------------------------------------------------------------------
00412 
00413 VDT_FORCE_INLINE double fast_asin(double x){
00414 
00415   int sign=1;
00416   double a = x; //necessary for linear approx
00417 
00418   if ( x < 0. ){
00419     sign *= -1;
00420     a *= -1;
00421     }
00422 
00423   double p, z, zz;
00424   double px,qx;
00425 
00426   /* arcsin(1-x) = pi/2 - sqrt(2x)(1+R(x))  */
00427   zz = 1.0 - a;
00428   
00429   px = RX1asin;
00430   px*= zz;
00431   px+= RX2asin;
00432   px*= zz;    
00433   px+= RX3asin;
00434   px*= zz;    
00435   px+= RX4asin;
00436   px*= zz;    
00437   px+= RX5asin;
00438 
00439   qx = zz;
00440   qx+= SX1asin;
00441   qx*= zz;    
00442   qx+= SX2asin;
00443   qx*= zz;    
00444   qx+= SX3asin;
00445   qx*= zz;    
00446   qx+= SX4asin;
00447   
00448   p =zz* px/qx;
00449   
00450 //     p = zz * polevl( zz, R, 4)/p1evl( zz, S, 4);
00451   
00452   zz = std::sqrt(zz+zz);
00453   z = PIO4 - zz;
00454   zz = zz * p - MOREBITS;
00455   z -= zz;
00456   z += PIO4;
00457     
00458   if( a < 0.625 ){
00459     zz = a * a;
00460     px = PX1asin;
00461     px*= zz;
00462     px+= PX2asin;
00463     px*= zz;    
00464     px+= PX3asin;
00465     px*= zz;    
00466     px+= PX4asin;
00467     px*= zz;    
00468     px+= PX5asin;
00469     px*= zz;    
00470     px+= PX6asin;
00471 
00472     qx = zz;
00473     qx+= QX1asin;
00474     qx*= zz;    
00475     qx+= QX2asin;
00476     qx*= zz;    
00477     qx+= QX3asin;
00478     qx*= zz;    
00479     qx+= QX4asin;
00480     qx*= zz;    
00481     qx+= QX5asin;
00482     
00483     z = zz*px/qx;    
00484 
00485     z = a * z + a;
00486     }
00487 
00488   z *= sign;
00489 
00490    //linear approx, not sooo needed but seable. Price is cheap though
00491   if( a < 1.0e-8 )
00492     z = a;
00493   
00494   return z;
00495   }
00496 
00497 //------------------------------------------------------------------------------
00498 
00499     
00501 VDT_FORCE_INLINE double fast_cos(double x){
00502 
00503   x = std::abs(x);
00504 
00505   if( x > PI )
00506     x = TWOPI - x ;
00507 
00508   int sign = 1;
00509    if( x > PIO2 ){
00510      x = PI - x;
00511      sign=-1;
00512      }
00513 
00514   double y = int( x/PIO4 ); // integer part of x/PIO4 
00515 
00516   int j=0;
00517   if (x>PIO4){
00518     j=2;
00519     y+=1;
00520     sign = -sign;
00521     }
00522 
00523   /* Extended precision modular arithmetic */
00524   double z = ((x - y * DP1sc) - y * DP2sc) - y * DP3sc;
00525 
00526   double zz = z * z;
00527 
00528   double px=0;
00529   if( j==2 ){
00530     px  = C1sin;
00531     px *= zz;
00532     px += C2sin;
00533     px *= zz;
00534     px += C3sin;
00535     px *= zz;
00536     px += C4sin;
00537     px *= zz;
00538     px += C5sin;
00539     px *= zz;
00540     px += C6sin;
00541     y = z  +  z * zz * px;
00542     }
00543   else{
00544     px  = C1cos;
00545     px *= zz;
00546     px += C2cos;
00547     px *= zz;
00548     px += C3cos;
00549     px *= zz;
00550     px += C4cos;
00551     px *= zz;
00552     px += C5cos;
00553     px *= zz;
00554     px += C6cos;  
00555     y = 1. - zz * .5 + zz * zz * px;  
00556     }
00557   
00558   y *= sign;
00559   
00560   return y;
00561   }
00562 
00563 // Acos ------------------------------------------------------------------------
00564 VDT_FORCE_INLINE double fast_acos(double x){
00565   double z;
00566  
00567 //   z = PIO4 - fast_asin(x);
00568 //   z += MOREBITS;
00569 //   z += PIO4;
00570 
00571 //   if (x > .5 )
00572     z = 2.0 * fast_asin(  sqrt(0.5 - 0.5 * x ) ) ;
00573     
00574   return z;
00575   }
00576 
00577 // Tangent  --------------------------------------------------------------------
00579 VDT_FORCE_INLINE double fast_tan( double x ){
00580 /* DP
00581  * Some of the ifs had to be skipped and replaced by calculations. This allowed 
00582  * the vectorisation but introduced a loss of performance. 
00583  * A solution should be found
00584 */  
00585   
00586    // make argument positive but save the sign
00587    // without ifs
00588    double abs_x =std::abs(x);
00589    int sign = x/abs_x;
00590    x = abs_x;
00591 
00592 // remove this if
00593 //     if (x > PI)
00594 //        x = x - PI;
00595 // like this:
00596    int nPI  = x /PI;
00597    x = x - nPI * PI;
00598 
00599 
00600 //     reflect and flip with if
00601 //     if (x > PIO2){
00602 //        x = PI - x ;
00603 //        sign = - sign;
00604 //       }
00605 // and without
00606     int nPIO2 = x/PIO2;
00607     int factor = ( 1 - 2* nPIO2);
00608     x = nPIO2* PI + factor * x;
00609     sign *= factor;
00610 
00611 
00612     /* compute x mod PIO4 */
00613     int nPIO4 = x/PIO4;
00614     double y = 2 * nPIO4;
00615 
00616     /* integer and fractional part modulo one octant */
00617 
00618 //     This if can be removed and the expression becomes
00619 //     if (x > PIO4){
00620 //        y=2.0;
00621 //       }
00622 //   like this:
00623 //     y = y  + nPIO4;
00624 
00625 
00626     double z = ((x - y * DP1tan) - y * DP2tan) - y * DP3tan;
00627 
00628     double zz = z * z;
00629 
00630     y=z;
00631 
00632     if( zz > 1.0e-14 ){
00633         double px = PX1tan;
00634         px *= zz;
00635         px += PX2tan;
00636         px *= zz;
00637         px += PX3tan;
00638 
00639         double qx=zz;
00640         qx += QX1tan;
00641         qx *=zz;
00642         qx += QX2tan;
00643         qx *=zz;
00644         qx += QX3tan;
00645         qx *=zz;
00646         qx += QX4tan;
00647 
00648         y = z + z * zz * px / qx;
00649     }
00650 
00651     // here if we are in the second octant we have
00652     //  y = -1 /y
00653    // else we have y!
00654    // again a trick not to use ifs...
00655     y -= nPIO4 * (  y  + 1.0 / y);
00656 
00657     y *= sign;
00658 
00659     return y ;
00660     }
00661 
00662 // Atan -------------------------------------------------------------------------
00663 // REMEMBER pi/2 == inf!!
00664 VDT_FORCE_INLINE double fast_atan(double x){
00665 
00666     /* make argument positive and save the sign */
00667     int sign = 1;
00668     if( x < 0.0 ) {
00669         x = - x;
00670         sign = -1;
00671         }
00672 
00673     /* range reduction */
00674     double originalx=x;
00675 
00676 // This is slower!
00677 //     double y = 0.0;
00678 //     double factor = 0.;
00679 // 
00680 //     if (x  > .66){
00681 //         y = PIO4;
00682 //         factor = MOREBITSO2;
00683 //         x = (x-1.0) / (x+1.0);
00684 //         }
00685 //     if( originalx > T3PO8 ) {
00686 //         y = PIO2;
00687 //         factor = MOREBITS;
00688 //         x = -1.0 / originalx ;
00689 //         }
00690 
00691     double y = PIO4;
00692     double factor = MOREBITSO2;
00693     x = (x-1.0) / (x+1.0);
00694 
00695     if( originalx > T3PO8 ) {
00696         y = PIO2;
00697         //flag = 1.;
00698         factor = MOREBITS;
00699         x = -1.0 / originalx ;
00700         }
00701     if ( originalx <= 0.66 ) {
00702         y = 0.0;
00703         x = originalx;
00704         //flag = 0.;
00705         factor = 0.;
00706         }
00707 
00708     double z = x * x;
00709 
00710     double px = PX1atan;
00711     px *= z;
00712     px += PX2atan;
00713     px *= z;
00714     px += PX3atan;
00715     px *= z;
00716     px += PX4atan;
00717     px *= z;
00718     px += PX5atan;
00719     px *= z; // for the final formula
00720 
00721     double qx=z;
00722     qx += QX1atan;
00723     qx *=z;
00724     qx += QX2atan;
00725     qx *=z;
00726     qx += QX3atan;
00727     qx *=z;
00728     qx += QX4atan;
00729     qx *=z;
00730     qx += QX5atan;
00731 
00732 //     z = px / qx;
00733 //     z = x * px / qx + x;
00734 
00735     y = y +x * px / qx + x +factor;
00736 
00737     y = sign * y;
00738 
00739     return y;
00740     }
00741 
00742 //------------------------------------------------------------------------------
00743 
00744 // Taken from from quake and remixed :-)
00745 
00746 VDT_FORCE_INLINE double fast_isqrt_general(double x, const unsigned short ISQRT_ITERATIONS) { 
00747 
00748   double x2 = x * 0.5;
00749   double y  = x;
00750   unsigned long long i  = d2ll(y);
00751   // Evil!
00752   i  = 0x5fe6eb50c7aa19f9  - ( i >> 1 );
00753   y  = ll2d(i);
00754   for (unsigned int j=0;j<ISQRT_ITERATIONS;++j)
00755       y *= 1.5 - ( x2 * y * y ) ;
00756 
00757   return y;
00758 }
00759 
00760 
00761 
00762 //------------------------------------------------------------------------------
00763 
00764 // Four iterations
00765 VDT_FORCE_INLINE double fast_isqrt(double x) {return fast_isqrt_general(x,4);} 
00766 
00767 // Two iterations
00768 VDT_FORCE_INLINE double fast_approx_isqrt(double x) {return fast_isqrt_general(x,3);}
00769 
00770 //------------------------------------------------------------------------------
00771 
00772 VDT_FORCE_INLINE double std_isqrt (double x) {return 1./std::sqrt(x);}
00773 
00774 //------------------------------------------------------------------------------
00775 
00776 VDT_FORCE_INLINE double fast_inv (double x) {
00777     double sign = 1;
00778     if( x < 0.0 ) {
00779         x = - x;
00780         sign = -1;
00781         }
00782     double y=fast_isqrt(x); 
00783     return y*y*sign;
00784     }
00785 
00786 VDT_FORCE_INLINE double fast_approx_inv (double x) {
00787     double sign = 1;
00788     if( x < 0.0 ) {
00789         x = - x;
00790         sign = -1;
00791         }
00792     double y=fast_approx_isqrt(x); 
00793     return y*y*sign;}
00794 
00795 VDT_FORCE_INLINE double std_inv (double x) {return 1./x;}
00796 
00797 //------------------------------------------------------------------------------
00798 // Some preprocessor in order to avoid a lot of error prone repetitions
00799 // CMS_VECTORIZE_VERBOSE is a preprocessor variable in a preprocessor function
00800 
00801 // Fast vector functions 
00802 #define FAST_VECT_FUNC(NAME) __attribute__((always_inline)) inline void NAME##_vect(double const * VDT_RESTRICT input , double * VDT_RESTRICT outupt, const unsigned int arr_size) { \
00803   for (unsigned int i=0;i<arr_size;++i) \
00804     outupt[i] = NAME ( input[i] ) CMS_VECTORIZE_VERBOSE; \
00805     }
00806 
00808 void fast_exp_vect_46(double const* input, double* output, const unsigned int arr_size);
00809 
00810 // Profitability threshold = 3
00811 FAST_VECT_FUNC(fast_exp)
00812 
00813 
00814 void fast_log_vect_46(double const* input, double* output, const unsigned int arr_size);
00815 
00816 // Profitability threshold = 3
00817 FAST_VECT_FUNC(fast_log)
00818 
00819 // Profitability threshold = 7
00820 FAST_VECT_FUNC(fast_sin)
00821 
00822 // Profitability threshold = 3
00823 FAST_VECT_FUNC(fast_asin)
00824 
00825 // Profitability threshold = 7
00826 FAST_VECT_FUNC(fast_cos)
00827 
00828 // Profitability threshold = 3
00829 FAST_VECT_FUNC(fast_acos)
00830 
00831 //Profitability threshold = 3
00832 FAST_VECT_FUNC(fast_tan)
00833 
00834 //Profitability threshold = 3
00835 FAST_VECT_FUNC(fast_atan)
00836   
00837 //Profitability threshold = 2 (2!!!)
00838 FAST_VECT_FUNC(fast_isqrt)
00839 
00840 //Profitability threshold = 2 (2!!!)
00841 FAST_VECT_FUNC(fast_approx_isqrt)
00842 
00843 //Profitability threshold = 2
00844 FAST_VECT_FUNC(fast_inv)
00845 
00846 //Profitability threshold = 2
00847 FAST_VECT_FUNC(fast_approx_inv)
00848 
00849 //------------------------------------------------------------------------------
00850 // Reference vector functions
00851 #define VECT_FUNC(NAME) __attribute__((always_inline)) inline void std_##NAME##_vect(double const * VDT_RESTRICT input , double* VDT_RESTRICT outupt, const unsigned int arr_size) { \
00852   for (unsigned int i=0;i<arr_size;++i) \
00853     outupt[i] = std::NAME ( input[i] ) CMS_VECTORIZE_VERBOSE; \
00854     }
00855 
00856 VECT_FUNC(exp)
00857 
00858 VECT_FUNC(log)
00859 
00860 VECT_FUNC(sin)
00861 
00862 VECT_FUNC(asin)
00863 
00864 VECT_FUNC(cos)
00865 
00866 VECT_FUNC(acos)
00867 
00868 VECT_FUNC(tan)
00869 
00870 VECT_FUNC(atan)
00871 
00872 VDT_FORCE_INLINE void std_isqrt_vect(double const * VDT_RESTRICT input , 
00873                     double* VDT_RESTRICT output, 
00874                     const unsigned int arr_size) CMS_VECTORIZE_VERBOSE{
00875   //Profitability threshold = 6
00876   for (unsigned int i=0;i<arr_size;++i)
00877     output[i] = vdt::std_isqrt(input[i]);
00878   }
00879 
00880 VDT_FORCE_INLINE void std_inv_vect(double const * VDT_RESTRICT input , 
00881                     double* VDT_RESTRICT output, 
00882                     const unsigned int arr_size) CMS_VECTORIZE_VERBOSE{
00883   //Profitability threshold = 6
00884   for (unsigned int i=0;i<arr_size;++i)
00885     output[i] = vdt::std_inv(input[i]);
00886   }
00887 
00888 
00889 //------------------------------------------------------------------------------
00890 
00891 } // end of vdt namespace
00892 
00893 #endif
00894