CMS 3D CMS Logo

/afs/cern.ch/work/a/aaltunda/public/www/CMSSW_6_2_7/src/DataFormats/Math/interface/approx_exp.h

Go to the documentation of this file.
00001 #ifndef APPROX_EXP_H
00002 #define APPROX_EXP_H
00003 /*  Quick and not that dirty vectorizable exp implementations
00004     Author: Florent de Dinechin, Aric, ENS-Lyon 
00005     with advice from Vincenzo Innocente, CERN
00006     All right reserved
00007 
00008 Warning + disclaimers:
00009  
00010 Feel free to distribute or insert in other programs etc, as long as this notice is attached.
00011     Comments, requests etc: Florent.de.Dinechin@ens-lyon.fr
00012 
00013 Polynomials were obtained using Sollya scripts (in comments): 
00014 please also keep these comments attached to the code. 
00015 
00016 If a derivative of this code ends up in the glibc I am too happy: the version with MANAGE_SUBNORMALS=1 and DEGREE=6 is faithful-accurate over the full 2^32 binary32 numbers and behaves well WRT exceptional cases. It is about 4 times faster than the stock expf on this PC, when compiled with gcc -O2.
00017 
00018 This code is FMA-safe (i.e. accelerated and more accurate with an FMA) as long as my parentheses are respected. 
00019 
00020 A remaining TODO is to try and manage the over/underflow using only integer tests as per Astasiev et al, RNC conf.
00021 Not sure it makes that much sense in the vector context.
00022 
00023 */
00024 
00025 // #define MANAGE_SUBNORMALS 1 // No measurable perf difference, so let's be clean.
00026 // If set to zero we flush to zero the subnormal outputs, ie for x<-88 or so
00027 
00028 // DEGREE 
00029 // 6 is perfect. 
00030 // 5 provides max 2-ulp error, 
00031 // 4 loses 44 ulps (6 bits) for an acceleration of 10% WRT 6
00032 // (I don't subtract the loop and call overhead, so it would be more for inlined code)
00033 
00034 // see the comments in the code for the accuracy you get from a given degree
00035 
00036 
00037 #include <cstdint>
00038 #include <cmath>
00039 #include <limits>
00040 #include <algorithm>
00041 
00042 #ifndef APPROX_MATH_N
00043 #define APPROX_MATH_N
00044 namespace approx_math {
00045   union binary32 {
00046     binary32() : ui32(0) {};
00047     binary32(float ff) : f(ff) {};
00048     binary32(int32_t ii) : i32(ii){}
00049     binary32(uint32_t ui) : ui32(ui){}
00050     
00051     uint32_t ui32; /* unsigned int */                
00052     int32_t i32; /* Signed int */                
00053     float f;
00054   };
00055 
00056 #ifdef __SSE4_1__
00057   inline float fpfloor(float x) {
00058     return std::floor(x);
00059   }
00060 #else
00061   inline float fpfloor(float x) {
00062     int32_t ret = x;
00063     binary32 xx(x);
00064     ret-=(xx.ui32>>31);  
00065     return ret;
00066   }
00067 #endif
00068 
00069 }
00070 #endif
00071 
00072 
00073 template<int DEGREE>
00074 inline float approx_expf_P(float p);
00075 
00076 // degree =  2   => absolute accuracy is  8 bits
00077 template<>
00078 inline float approx_expf_P<2>(float y) {
00079   return   float(0x2.p0) + y * (float(0x2.07b99p0) + y * float(0x1.025b84p0)) ;
00080 }
00081 // degree =  3   => absolute accuracy is  12 bits
00082 template<>
00083 inline float approx_expf_P<3>(float y) {
00084 #ifdef HORNER  // HORNER 
00085   return   float(0x2.p0) + y * (float(0x1.fff798p0) + y * (float(0x1.02249p0) + y * float(0x5.62042p-4))) ;
00086 #else // ESTRIN
00087   float p23 = (float(0x1.02249p0) + y * float(0x5.62042p-4)) ;
00088   float p01 = float(0x2.p0) + y * float(0x1.fff798p0);
00089   return p01 + y*y*p23;
00090 #endif
00091 }
00092 // degree =  4   => absolute accuracy is  17 bits
00093 template<>
00094 inline float approx_expf_P<4>(float y) {
00095   return   float(0x2.p0) + y * (float(0x1.fffb1p0) + y * (float(0xf.ffe84p-4) + y * (float(0x5.5f9c1p-4) + y * float(0x1.57755p-4)))) ;
00096 }
00097 // degree =  5   => absolute accuracy is  22 bits
00098 template<>
00099 inline float approx_expf_P<5>(float y) {
00100   return   float(0x2.p0) + y * (float(0x2.p0) + y * (float(0xf.ffed8p-4) + y * (float(0x5.5551cp-4) + y * (float(0x1.5740d8p-4) + y * float(0x4.49368p-8))))) ;
00101 }
00102 // degree =  6   => absolute accuracy is  27 bits
00103 template<>
00104 inline float approx_expf_P<6>(float y) {
00105 #ifdef HORNER  // HORNER 
00106   float p =  float(0x2.p0) + y * (float(0x2.p0) + y * (float(0x1.p0) + y * (float(0x5.55523p-4) + y * (float(0x1.5554dcp-4) + y * (float(0x4.48f41p-8) + y * float(0xb.6ad4p-12)))))) ;
00107 #else // ESTRIN does seem to save a cycle or two
00108   float p56 = float(0x4.48f41p-8) + y * float(0xb.6ad4p-12);
00109   float p34 = float(0x5.55523p-4) + y * float(0x1.5554dcp-4);
00110   float y2 = y*y;
00111   float p12 = float(0x2.p0) + y; // By chance we save one operation here! Funny.
00112   float p36 = p34 + y2*p56;
00113   float p16 = p12 + y2*p36;
00114   float p =  float(0x2.p0) + y*p16;
00115 #endif
00116   return p;
00117 }
00118 
00119 // degree =  7   => absolute accuracy is  31 bits
00120 template<>
00121 inline float approx_expf_P<7>(float y) {
00122    return float(0x2.p0) + y * (float(0x2.p0) + y * (float(0x1.p0) + y * (float(0x5.55555p-4) + y * (float(0x1.5554e4p-4) + y * (float(0x4.444adp-8) + y * (float(0xb.6a8a6p-12) + y * float(0x1.9ec814p-12))))))) ;
00123 }
00124 
00125 /* The Sollya script that computes the polynomials above
00126 
00127 
00128 f= 2*exp(y);
00129 I=[-log(2)/2;log(2)/2];
00130 filename="/tmp/polynomials";
00131 print("") > filename;
00132 for deg from 2 to 8 do begin
00133   p = fpminimax(f, deg,[|1,23...|],I, floating, absolute); 
00134   display=decimal;
00135   acc=floor(-log2(sup(supnorm(p, f, I, absolute, 2^(-40)))));
00136   print( "   // degree = ", deg, 
00137          "  => absolute accuracy is ",  acc, "bits" ) >> filename;
00138   print("#if ( DEGREE ==", deg, ")") >> filename;
00139   display=hexadecimal;
00140   print("   float p = ", horner(p) , ";") >> filename;
00141   print("#endif") >> filename;
00142 end;
00143 
00144 */
00145 
00146 
00147 // valid for -87.3365 < x < 88.7228
00148 template<int DEGREE>
00149 inline float unsafe_expf_impl(float x) {
00150   using namespace approx_math;
00151   /* Sollya for the following constants:
00152      display=hexadecimal;
00153      1b23+1b22;
00154      single(1/log(2));
00155      log2H=round(log(2), 16, RN);
00156      log2L = single(log(2)-log2H);
00157      log2H; log2L;
00158      
00159   */
00160   // constexpr float rnd_cst = float(0xc.p20);
00161   constexpr float inv_log2f = float(0x1.715476p0);
00162   constexpr float log2H = float(0xb.172p-4);
00163   constexpr float log2L = float(0x1.7f7d1cp-20);
00164   
00165    
00166   float y = x;
00167   // This is doing round(x*inv_log2f) to the nearest integer
00168   float z = fpfloor((x*inv_log2f) +0.5f);
00169   // Cody-and-Waite accurate range reduction. FMA-safe.
00170   y -= z*log2H;
00171   y -= z*log2L;
00172   // exponent 
00173   int32_t e = z;
00174   
00175 
00176   // we want RN above because it centers the interval around zero
00177   // but then we could have 2^e = below being infinity when it shouldn't 
00178   // (when e=128 but p<1)
00179   // so we avoid this case by reducing e and evaluating a polynomial for 2*exp
00180   e -=1; 
00181 
00182   // NaN inputs will propagate to the output as expected
00183 
00184   float p = approx_expf_P<DEGREE>(y);
00185 
00186   // cout << "x=" << x << "  e=" << e << "  y=" << y << "  p=" << p <<"\n";
00187   binary32 ef;
00188   uint32_t biased_exponent= e+127;
00189   ef.ui32=(biased_exponent<<23);
00190   
00191   return p * ef.f;
00192 }
00193 
00194 
00195 #ifndef NO_APPROX_MATH
00196 
00197 template<int DEGREE>
00198 inline float unsafe_expf(float x) {
00199  return  unsafe_expf_impl<DEGREE>(x); 
00200 }
00201 
00202 template<int DEGREE>
00203 inline float approx_expf(float x) {
00204 
00205   constexpr float inf_threshold =float(0x5.8b90cp4);
00206   // log of the smallest normal
00207   constexpr float zero_threshold_ftz =-float(0x5.75628p4); // sollya: single(log(1b-126));
00208   // flush to zero on the output
00209   // manage infty output:
00210   // faster than directly on output!
00211   x = std::min(std::max(x,zero_threshold_ftz),inf_threshold);
00212   float r = unsafe_expf<DEGREE>(x); 
00213 
00214    return r;
00215 }
00216 
00217 
00218 #else
00219 template<int DEGREE>
00220 inline float unsafe_expf(float x) {
00221   return std::exp(x);
00222 }
00223 template<int DEGREE>
00224 inline float approx_expf(float x) {
00225   return std::exp(x);
00226 }
00227 #endif  // NO_APPROX_MATH
00228 
00229 
00230 
00231 #endif