CMS 3D CMS Logo

libminifloat.h
Go to the documentation of this file.
1 #ifndef libminifloat_h
2 #define libminifloat_h
4 #include <cstdint>
5 #include <cassert>
6 #include <algorithm>
7 
8 // ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
10  public:
12  inline static float float16to32(uint16_t h) {
13  union { float flt; uint32_t i32; } conv;
14  conv.i32 = mantissatable[offsettable[h>>10]+(h&0x3ff)]+exponenttable[h>>10];
15  return conv.flt;
16  }
17  inline static uint16_t float32to16(float x) {
18  return float32to16round(x);
19  }
21  inline static uint16_t float32to16crop(float x) {
22  union { float flt; uint32_t i32; } conv;
23  conv.flt = x;
24  return basetable[(conv.i32>>23)&0x1ff]+((conv.i32&0x007fffff)>>shifttable[(conv.i32>>23)&0x1ff]);
25  }
27  inline static uint16_t float32to16round(float x) {
28  union { float flt; uint32_t i32; } conv;
29  conv.flt = x;
30  uint8_t shift = shifttable[(conv.i32>>23)&0x1ff];
31  if (shift == 13) {
32  uint16_t base2 = (conv.i32&0x007fffff)>>12;
33  uint16_t base = base2 >> 1;
34  if (((base2 & 1) != 0) && (base < 1023)) base++;
35  return basetable[(conv.i32>>23)&0x1ff]+base;
36  } else {
37  return basetable[(conv.i32>>23)&0x1ff]+((conv.i32&0x007fffff)>>shifttable[(conv.i32>>23)&0x1ff]);
38  }
39  }
// Truncate the float32 mantissa of f to its `bits` most significant bits
// (compile-time width). Sign and exponent are left untouched; the discarded
// low mantissa bits are zeroed, with no rounding.
template<int bits>
inline static float reduceMantissaToNbits(const float &f)
{
  static_assert(bits <= 23,"max mantissa size is 23 bits");
  constexpr int dropped = 23 - bits;
  constexpr uint32_t keep = (0xFFFFFFFF >> dropped) << dropped;
  union { uint32_t i32; float flt; } pun;
  pun.flt = f;
  pun.i32 = pun.i32 & keep;
  return pun.flt;
}
// Truncate the float32 mantissa of f to its `bits` most significant bits
// (run-time width). Sign and exponent are left untouched; the discarded low
// mantissa bits are zeroed, with no rounding.
//
// bits must lie in [0, 23]: anything larger makes the shift count negative,
// which is undefined behaviour, so the range is checked up front.
inline static float reduceMantissaToNbits(const float &f, int bits)
{
  assert(bits >= 0 && bits <= 23); // "mantissa size must be between 0 and 23 bits"
  const int dropped = 23 - bits;
  const uint32_t mask = (0xFFFFFFFF >> dropped) << dropped;
  union { float flt; uint32_t i32; } conv;
  conv.flt = f;
  conv.i32 &= mask;
  return conv.flt;
}
58 
60  public:
62  shift(23-bits), mask((0xFFFFFFFF >> (shift)) << (shift)),
63  test(1 << (shift-1)), maxn((1<<bits)-2) {
64  assert(bits <= 23); // "max mantissa size is 23 bits"
65  }
66  float operator()(float f) const {
67  constexpr uint32_t low23 = (0x007FFFFF); // mask to keep lowest 23 bits = mantissa
68  constexpr uint32_t hi9 = (0xFF800000); // mask to keep highest 9 bits = the rest
69  union { float flt; uint32_t i32; } conv;
70  conv.flt=f;
71  if (conv.i32 & test) { // need to round
72  uint32_t mantissa = (conv.i32 & low23) >> shift;
73  if (mantissa < maxn) mantissa++;
74  conv.i32 = (conv.i32 & hi9) | (mantissa << shift);
75  } else {
76  conv.i32 &= mask;
77  }
78  return conv.flt;
79  }
80  private:
81  const int shift;
82  const uint32_t mask, test, maxn;
83  };
84 
85  template<int bits>
86  inline static float reduceMantissaToNbitsRounding(const float &f)
87  {
88  static const ReduceMantissaToNbitsRounding reducer(bits);
89  return reducer(f);
90  }
91 
92 
93 
94  inline static float reduceMantissaToNbitsRounding(float f, int bits)
95  {
96  return ReduceMantissaToNbitsRounding(bits)(f);
97  }
98 
99  template<typename InItr, typename OutItr>
100  static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out)
101  {
102  std::transform(begin, end, out, ReduceMantissaToNbitsRounding(bits));
103  }
104 
105 
// Largest finite value representable as a 16-bit float, returned as float32.
inline static float max() {
  // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
  const uint32_t maxHalfAsFloat32 = 0x477fe000;
  union { uint32_t i32; float flt; } pun;
  pun.i32 = maxHalfAsFloat32;
  return pun.flt;
}
111 
112  // Maximum float32 value that gets rounded to max()
inline static float max32RoundedToMax16() {
  // 0x8f is the biased exponent of 2^16 (127 + 16), so 0x8f << 23 is the
  // float32 bit pattern of 2^16, the first value that converts to inf in
  // float16. The pattern one below it is the largest float32 strictly less
  // than 2^16, i.e. the last one that still converts to max().
  const uint32_t twoPow16 = uint32_t(0x8f) << 23;
  union { uint32_t i32; float flt; } pun;
  pun.i32 = twoPow16 - 1;
  return pun.flt;
}
120 
// Smallest positive normalized 16-bit float value (2^-14), returned as float32.
inline static float min() {
  // = mantissatable[offsettable[1]+0]+exponenttable[1]
  const uint32_t minHalfAsFloat32 = 0x38800000;
  union { uint32_t i32; float flt; } pun;
  pun.i32 = minHalfAsFloat32;
  return pun.flt;
}
126 
127  // Minimum float32 value that gets rounded to min()
inline static float min32RoundedToMin16() {
  // 0x71 is the biased exponent of 2^-14 (127 - 14), so this is the float32
  // bit pattern of exactly 2^-14. Any smaller float32 converts to a
  // denormalized float16, which makes 2^-14 the first float32 that yields
  // min() in float16.
  union { uint32_t i32; float flt; } pun;
  pun.i32 = uint32_t(0x71) << 23;
  return pun.flt;
}
135 
// Smallest positive denormalized 16-bit float value (2^-24), returned as float32.
inline static float denorm_min() {
  // mantissatable[offsettable[0]+1]+exponenttable[0]
  const uint32_t denormMinAsFloat32 = 0x33800000;
  union { uint32_t i32; float flt; } pun;
  pun.i32 = denormMinAsFloat32;
  return pun.flt;
}
141 
// True when the 16-bit float pattern h is denormalized: the 5-bit exponent
// field is zero (the sign bit is ignored) while the 10-bit mantissa is not.
inline static bool isdenorm(uint16_t h) {
  const unsigned exponent = (h >> 10) & 0x1f;
  const unsigned mantissa = h & 0x3ff;
  if (exponent != 0) {
    return false;
  }
  return mantissa != 0;
}
146 
147  private:
148  CMS_THREAD_SAFE static uint32_t mantissatable[2048]; // float16to32: read at offsettable[h>>10] + (h & 0x3ff)
149  CMS_THREAD_SAFE static uint32_t exponenttable[64]; // float16to32: indexed by h>>10, added to the mantissatable entry
150  CMS_THREAD_SAFE static uint16_t offsettable[64]; // float16to32: indexed by h>>10, gives the start offset into mantissatable
151  CMS_THREAD_SAFE static uint16_t basetable[512]; // float32to16*: indexed by the top 9 bits of the float32 word, base of the 16-bit result
152  CMS_THREAD_SAFE static uint8_t shifttable[512]; // float32to16*: same index as basetable, right shift applied to the 23-bit mantissa
153  static void filltables() ; // NOTE(review): presumably populates the tables above; its body is not visible here - confirm
154 };
155 #endif
static uint16_t float32to16crop(float x)
Fast implementation, but it crops the number so it biases low.
Definition: libminifloat.h:21
static float min32RoundedToMin16()
Definition: libminifloat.h:128
FWCore Framework interface EventSetupRecordImplementation h
Helper function to determine trigger accepts.
static uint16_t offsettable[64]
Definition: libminifloat.h:150
static HepMC::IO_HEPEVT conv
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION [A criterion that matches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision bits
static float float16to32(uint16_t h)
Definition: libminifloat.h:12
static uint16_t basetable[512]
Definition: libminifloat.h:151
#define constexpr
static float denorm_min()
Definition: libminifloat.h:136
static uint8_t shifttable[512]
Definition: libminifloat.h:152
static uint16_t float32to16(float x)
Definition: libminifloat.h:17
static uint32_t mantissatable[2048]
Definition: libminifloat.h:148
static float reduceMantissaToNbits(const float &f, int bits)
Definition: libminifloat.h:50
double f[11][100]
#define CMS_THREAD_SAFE
#define end
Definition: vmac.h:39
base
Make Sure CMSSW is Setup ##.
static float min()
Definition: libminifloat.h:121
static float reduceMantissaToNbitsRounding(float f, int bits)
Definition: libminifloat.h:94
static bool isdenorm(uint16_t h)
Definition: libminifloat.h:142
static float reduceMantissaToNbitsRounding(const float &f)
Definition: libminifloat.h:86
#define begin
Definition: vmac.h:32
static float max32RoundedToMax16()
Definition: libminifloat.h:113
static unsigned int const shift
static float max()
Definition: libminifloat.h:106
static void filltables()
Definition: libminifloat.cc:18
static uint16_t float32to16round(float x)
Slower implementation, but it rounds to avoid biases.
Definition: libminifloat.h:27
static float reduceMantissaToNbits(const float &f)
Definition: libminifloat.h:41
static uint32_t exponenttable[64]
Definition: libminifloat.h:149
static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out)
Definition: libminifloat.h:100