CMS 3D CMS Logo

approx_exp.h
Go to the documentation of this file.
1 #ifndef DataFormatsMathAPPROX_EXP_H
2 #define DataFormatsMathAPPROX_EXP_H
3 /* Quick and not that dirty vectorizable exp implementations
4  Author: Florent de Dinechin, Aric, ENS-Lyon
5  with advice from Vincenzo Innocente, CERN
6  All right reserved
7 
8 Warning + disclaimers:
9 
10 Feel free to distribute or insert in other programs etc, as long as this notice is attached.
11  Comments, requests etc: Florent.de.Dinechin@ens-lyon.fr
12 
13 Polynomials were obtained using Sollya scripts (in comments):
14 please also keep these comments attached to the code.
15 
16 If a derivative of this code ends up in the glibc I am too happy: the version with MANAGE_SUBNORMALS=1 and DEGREE=6 is faithful-accurate over the full 2^32 binary32 numbers and behaves well WRT exceptional cases. It is about 4 times faster than the stock expf on this PC, when compiled with gcc -O2.
17 
18 This code is FMA-safe (i.e. accelerated and more accurate with an FMA) as long as my parentheses are respected.
19 
20 A remaining TODO is to try and manage the over/underflow using only integer tests as per Astasiev et al, RNC conf.
21 Not sure it makes that much sense in the vector context.
22 
23 */
24 
25 // #define MANAGE_SUBNORMALS 1 // No measurable perf difference, so let's be clean.
26 // If set to zero we flush to zero the subnormal outputs, ie for x<-88 or so
27 
28 // DEGREE
29 // 6 is perfect.
30 // 5 provides max 2-ulp error,
31 // 4 loses 44 ulps (6 bits) for an acceleration of 10% WRT 6
32 // (I don't subtract the loop and call overhead, so it would be more for constexprd code)
33 
34 // see the comments in the code for the accuracy you get from a given degree
35 
37 
38 template <int DEGREE>
39 constexpr float approx_expf_P(float p);
40 
41 // degree = 2 => absolute accuracy is 8 bits
42 template <>
43 constexpr float approx_expf_P<2>(float y) {
44  return float(0x2.p0) + y * (float(0x2.07b99p0) + y * float(0x1.025b84p0));
45 }
46 // degree = 3 => absolute accuracy is 12 bits
47 template <>
48 constexpr float approx_expf_P<3>(float y) {
49 #ifdef HORNER // HORNER
50  return float(0x2.p0) + y * (float(0x1.fff798p0) + y * (float(0x1.02249p0) + y * float(0x5.62042p-4)));
51 #else // ESTRIN
52  float p23 = (float(0x1.02249p0) + y * float(0x5.62042p-4));
53  float p01 = float(0x2.p0) + y * float(0x1.fff798p0);
54  return p01 + y * y * p23;
55 #endif
56 }
57 // degree = 4 => absolute accuracy is 17 bits
58 template <>
59 constexpr float approx_expf_P<4>(float y) {
60  return float(0x2.p0) +
61  y * (float(0x1.fffb1p0) + y * (float(0xf.ffe84p-4) + y * (float(0x5.5f9c1p-4) + y * float(0x1.57755p-4))));
62 }
63 // degree = 5 => absolute accuracy is 22 bits
64 template <>
65 constexpr float approx_expf_P<5>(float y) {
66  return float(0x2.p0) +
67  y * (float(0x2.p0) + y * (float(0xf.ffed8p-4) +
68  y * (float(0x5.5551cp-4) + y * (float(0x1.5740d8p-4) + y * float(0x4.49368p-8)))));
69 }
70 // degree = 6 => absolute accuracy is 27 bits
71 template <>
72 constexpr float approx_expf_P<6>(float y) {
73 #ifdef HORNER // HORNER
74  float p =
75  float(0x2.p0) +
76  y * (float(0x2.p0) +
77  y * (float(0x1.p0) + y * (float(0x5.55523p-4) + y * (float(0x1.5554dcp-4) +
78  y * (float(0x4.48f41p-8) + y * float(0xb.6ad4p-12))))));
79 #else // ESTRIN does seem to save a cycle or two
80  float p56 = float(0x4.48f41p-8) + y * float(0xb.6ad4p-12);
81  float p34 = float(0x5.55523p-4) + y * float(0x1.5554dcp-4);
82  float y2 = y * y;
83  float p12 = float(0x2.p0) + y; // By chance we save one operation here! Funny.
84  float p36 = p34 + y2 * p56;
85  float p16 = p12 + y2 * p36;
86  float p = float(0x2.p0) + y * p16;
87 #endif
88  return p;
89 }
90 
91 // degree = 7 => absolute accuracy is 31 bits
92 template <>
93 constexpr float approx_expf_P<7>(float y) {
94  return float(0x2.p0) +
95  y * (float(0x2.p0) +
96  y * (float(0x1.p0) +
97  y * (float(0x5.55555p-4) +
98  y * (float(0x1.5554e4p-4) +
99  y * (float(0x4.444adp-8) + y * (float(0xb.6a8a6p-12) + y * float(0x1.9ec814p-12)))))));
100 }
101 
102 /* The Sollya script that computes the polynomials above
103 
104 
105 f= 2*exp(y);
106 I=[-log(2)/2;log(2)/2];
107 filename="/tmp/polynomials";
108 print("") > filename;
109 for deg from 2 to 8 do begin
110  p = fpminimax(f, deg,[|1,23...|],I, floating, absolute);
111  display=decimal;
112  acc=floor(-log2(sup(supnorm(p, f, I, absolute, 2^(-40)))));
113  print( " // degree = ", deg,
114  " => absolute accuracy is ", acc, "bits" ) >> filename;
115  print("#if ( DEGREE ==", deg, ")") >> filename;
116  display=hexadecimal;
117  print(" float p = ", horner(p) , ";") >> filename;
118  print("#endif") >> filename;
119 end;
120 
121 */
122 
123 // valid for -87.3365 < x < 88.7228
124 template <int DEGREE>
125 constexpr float unsafe_expf_impl(float x) {
126  using namespace approx_math;
127  /* Sollya for the following constants:
128  display=hexadecimal;
129  1b23+1b22;
130  single(1/log(2));
131  log2H=round(log(2), 16, RN);
132  log2L = single(log(2)-log2H);
133  log2H; log2L;
134 
135  */
136  // constexpr float rnd_cst = float(0xc.p20);
137  constexpr float inv_log2f = float(0x1.715476p0);
138  constexpr float log2H = float(0xb.172p-4);
139  constexpr float log2L = float(0x1.7f7d1cp-20);
140 
141  float y = x;
142  // This is doing round(x*inv_log2f) to the nearest integer
143  float z = fpfloor((x * inv_log2f) + 0.5f);
144  // Cody-and-Waite accurate range reduction. FMA-safe.
145  y -= z * log2H;
146  y -= z * log2L;
147  // exponent
148  int32_t e = z;
149 
150  // we want RN above because it centers the interval around zero
151  // but then we could have 2^e = below being infinity when it shouldn't
152  // (when e=128 but p<1)
153  // so we avoid this case by reducing e and evaluating a polynomial for 2*exp
154  e -= 1;
155 
156  // NaN inputs will propagate to the output as expected
157 
158  float p = approx_expf_P<DEGREE>(y);
159 
160  // cout << "x=" << x << " e=" << e << " y=" << y << " p=" << p <<"\n";
161  binary32 ef;
162  uint32_t biased_exponent = e + 127;
163  ef.ui32 = (biased_exponent << 23);
164 
165  return p * ef.f;
166 }
167 
168 #ifndef NO_APPROX_MATH
169 
170 template <int DEGREE>
171 constexpr float unsafe_expf(float x) {
172  return unsafe_expf_impl<DEGREE>(x);
173 }
174 
175 template <int DEGREE>
176 constexpr float approx_expf(float x) {
177  constexpr float inf_threshold = float(0x5.8b90cp4);
178  // log of the smallest normal
179  constexpr float zero_threshold_ftz = -float(0x5.75628p4); // sollya: single(log(1b-126));
180  // flush to zero on the output
181  // manage infty output:
182  // faster than directly on output!
183  x = std::min(std::max(x, zero_threshold_ftz), inf_threshold);
184  float r = unsafe_expf<DEGREE>(x);
185 
186  return r;
187 }
188 
189 #else
190 template <int DEGREE>
191 constexpr float unsafe_expf(float x) {
192  return std::exp(x);
193 }
194 template <int DEGREE>
195 constexpr float approx_expf(float x) {
196  return std::exp(x);
197 }
198 #endif // NO_APPROX_MATH
199 
200 #endif
approx_expf_P
constexpr float approx_expf_P(float p)
dqmMemoryStats.float
float
Definition: dqmMemoryStats.py:127
approx_expf
constexpr float approx_expf(float x)
Definition: approx_exp.h:176
approx_math::binary32::ui32
uint32_t ui32
Definition: approx_math.h:18
approx_expf_P< 2 >
constexpr float approx_expf_P< 2 >(float y)
Definition: approx_exp.h:43
approx_math::binary32
Definition: approx_math.h:12
f
double f[11][100]
Definition: MuScleFitUtils.cc:78
min
T min(T a, T b)
Definition: MathUtil.h:58
testProducerWithPsetDescEmpty_cfi.x2
x2
Definition: testProducerWithPsetDescEmpty_cfi.py:28
AlCaHLTBitMon_ParallelJobs.p
p
Definition: AlCaHLTBitMon_ParallelJobs.py:153
approx_expf_P< 6 >
constexpr float approx_expf_P< 6 >(float y)
Definition: approx_exp.h:72
approx_expf_P< 4 >
constexpr float approx_expf_P< 4 >(float y)
Definition: approx_exp.h:59
testProducerWithPsetDescEmpty_cfi.x1
x1
Definition: testProducerWithPsetDescEmpty_cfi.py:33
vertices_cff.x
x
Definition: vertices_cff.py:29
approx_math::binary32::f
float f
Definition: approx_math.h:20
testProducerWithPsetDescEmpty_cfi.y2
y2
Definition: testProducerWithPsetDescEmpty_cfi.py:30
unsafe_expf
constexpr float unsafe_expf(float x)
Definition: approx_exp.h:171
approx_expf_P< 7 >
constexpr float approx_expf_P< 7 >(float y)
Definition: approx_exp.h:93
SiStripPI::max
Definition: SiStripPayloadInspectorHelper.h:169
approx_expf_P< 3 >
constexpr float approx_expf_P< 3 >(float y)
Definition: approx_exp.h:48
approx_math.h
approx_expf_P< 5 >
constexpr float approx_expf_P< 5 >(float y)
Definition: approx_exp.h:65
approx_math::fpfloor
constexpr float fpfloor(float x)
Definition: approx_math.h:25
unsafe_expf_impl
constexpr float unsafe_expf_impl(float x)
Definition: approx_exp.h:125
alignCSCRings.r
r
Definition: alignCSCRings.py:93
approx_math
Definition: approx_math.h:9
detailsBasic3DVector::y
float float y
Definition: extBasic3DVector.h:14
JetChargeProducer_cfi.exp
exp
Definition: JetChargeProducer_cfi.py:6
MillePedeFileConverter_cfg.e
e
Definition: MillePedeFileConverter_cfg.py:37