CMS 3D CMS Logo

libminifloat.h
Go to the documentation of this file.
1 #ifndef libminifloat_h
2 #define libminifloat_h
4 #include <cstdint>
5 
6 // ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
8  public:
10  inline static float float16to32(uint16_t h) {
11  union { float flt; uint32_t i32; } conv;
12  conv.i32 = mantissatable[offsettable[h>>10]+(h&0x3ff)]+exponenttable[h>>10];
13  return conv.flt;
14  }
15  inline static uint16_t float32to16(float x) {
16  return float32to16round(x);
17  }
19  inline static uint16_t float32to16crop(float x) {
20  union { float flt; uint32_t i32; } conv;
21  conv.flt = x;
22  return basetable[(conv.i32>>23)&0x1ff]+((conv.i32&0x007fffff)>>shifttable[(conv.i32>>23)&0x1ff]);
23  }
25  inline static uint16_t float32to16round(float x) {
26  union { float flt; uint32_t i32; } conv;
27  conv.flt = x;
28  uint8_t shift = shifttable[(conv.i32>>23)&0x1ff];
29  if (shift == 13) {
30  uint16_t base2 = (conv.i32&0x007fffff)>>12;
31  uint16_t base = base2 >> 1;
32  if (((base2 & 1) != 0) && (base < 1023)) base++;
33  return basetable[(conv.i32>>23)&0x1ff]+base;
34  } else {
35  return basetable[(conv.i32>>23)&0x1ff]+((conv.i32&0x007fffff)>>shifttable[(conv.i32>>23)&0x1ff]);
36  }
37  }
// Truncate the mantissa of a float32 to its `bits` most significant bits
// (compile-time width). The discarded low bits are zeroed; no rounding.
//   f : input value.
// Returns f with the lowest (23 - bits) mantissa bits cleared.
template<int bits>
inline static float reduceMantissaToNbits(const float &f)
{
  static_assert(bits <= 23,"max mantissa size is 23 bits");
  constexpr int dropped = 23 - bits;                                // low bits to clear
  constexpr uint32_t keep = ~((uint32_t(1) << dropped) - 1);        // ones above the cleared bits
  union { uint32_t i32; float flt; } pun;
  pun.flt = f;
  pun.i32 = pun.i32 & keep;
  return pun.flt;
}
// Truncate the mantissa of a float32 to its `bits` most significant bits
// (run-time width). The discarded low bits are zeroed; no rounding.
//   f    : input value.
//   bits : number of mantissa bits to keep. Values above 23 are treated as 23
//          (f is returned unchanged) and negative values as 0, so the shift
//          count below always stays within [0, 23]. Without the clamp a
//          too-large `bits` would produce a negative shift count, which is
//          undefined behaviour, and a negative `bits` would let the mask
//          reach into the exponent field.
// Returns f with the lowest (23 - bits) mantissa bits cleared.
inline static float reduceMantissaToNbits(const float &f, int bits)
{
  if (bits >= 23) return f;  // nothing to drop
  const int shift = 23 - (bits > 0 ? bits : 0);
  const uint32_t mask = (0xFFFFFFFFu >> shift) << shift;
  union { float flt; uint32_t i32; } conv;
  conv.flt=f;
  conv.i32&=mask;
  return conv.flt;
}
56 
// Reduce the mantissa of a float32 to `bits` bits (compile-time width),
// rounding up when the most significant discarded bit is set.
//   f : input value.
// Returns f with the lowest (23 - bits) mantissa bits cleared, after the
// optional round-up of the kept part.
//
// The round-up is skipped when the kept mantissa is already at or above
// (1 << bits) - 2, so a carry can never propagate into the exponent field;
// such values are truncated instead.
// NOTE(review): a kept mantissa equal to (1 << bits) - 2 could still be
// incremented without overflowing; the limit is left as it was so results for
// existing callers do not change -- confirm whether that is intended.
//
// Edge cases handled explicitly:
//  * bits == 23: nothing is discarded, so f is returned unchanged (the
//    "round bit" mask is 0 instead of the result of a negative shift).
//  * bits == 0: no round-up is ever applied, so the increment cannot be
//    OR-ed into the exponent field.
template<int bits>
inline static float reduceMantissaToNbitsRounding(const float &f)
{
  static_assert(bits >= 0,"mantissa size cannot be negative");
  static_assert(bits <= 23,"max mantissa size is 23 bits");
  constexpr int shift = (23-bits); // bits I throw away
  constexpr uint32_t mask = (0xFFFFFFFFu >> (shift)) << (shift); // mask for truncation
  constexpr uint32_t test = (uint32_t(1) << shift) >> 1; // most significant bit I throw away (0 if none)
  constexpr uint32_t low23 = (0x007FFFFF); // mask to keep lowest 23 bits = mantissa
  constexpr uint32_t hi9 = (0xFF800000); // mask to keep highest 9 bits = the rest
  constexpr uint32_t maxn = bits > 0 ? (uint32_t(1) << bits) - 2 : 0; // max number I can increase before overflowing
  union { float flt; uint32_t i32; } conv;
  conv.flt=f;
  if (conv.i32 & test) { // need to round
    uint32_t mantissa = (conv.i32 & low23) >> shift;
    if (mantissa < maxn) mantissa++;
    conv.i32 = (conv.i32 & hi9) | (mantissa << shift);
  } else {
    conv.i32 &= mask;
  }
  return conv.flt;
}
78 
// Float32 value with bit pattern 0x477fe000 (65504). Per the original note it
// equals mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e], i.e. the
// decoded value of the half-float with exponent field 0x1e and a full mantissa.
inline static float max() {
  union { uint32_t i32; float flt; } pun = { 0x477fe000 };
  return pun.flt;
}
84 
// Maximum float32 value that gets rounded to max().
// (0x8f << 23) is the bit pattern of 2^16, which per the original note is the
// first float32 that converts to infinity in float16. Subtracting one from the
// bit pattern gives the largest float32 strictly below 2^16 (0x477fffff).
inline static float max32RoundedToMax16() {
  const uint32_t twoPow16Bits = (0x8f << 23);
  union { uint32_t i32; float flt; } pun = { twoPow16Bits - 1 };
  return pun.flt;
}
93 
// Float32 value with bit pattern 0x38800000 (2^-14). Per the original note it
// equals mantissatable[offsettable[1]+0]+exponenttable[1], i.e. the decoded
// value of the half-float with exponent field 1 and zero mantissa.
inline static float min() {
  union { uint32_t i32; float flt; } pun = { 0x38800000 };
  return pun.flt;
}
99 
// Minimum float32 value that gets rounded to min().
// Per the original note, float32 values just below 2^-14 already convert to a
// denormalized float16, so 2^-14 itself (exponent field 0x71, zero mantissa)
// is the smallest float32 that converts to min().
inline static float min32RoundedToMin16() {
  const uint32_t twoPowMinus14Bits = (0x71 << 23);
  union { uint32_t i32; float flt; } pun = { twoPowMinus14Bits };
  return pun.flt;
}
108 
// Float32 value with bit pattern 0x33800000 (2^-24). Per the original note it
// equals mantissatable[offsettable[0]+1]+exponenttable[0], i.e. the decoded
// value of the half-float with exponent field 0 and mantissa 1.
inline static float denorm_min() {
  union { uint32_t i32; float flt; } pun = { 0x33800000 };
  return pun.flt;
}
114 
// Tell whether a half-float bit pattern is denormalized: its 5-bit exponent
// field is zero while its 10-bit mantissa is not. The sign bit is ignored.
//   h : half-float bit pattern.
inline static bool isdenorm(uint16_t h) {
  const uint16_t exponentField = (h >> 10) & 0x1f;
  const uint16_t mantissaField = h & 0x3ff;
  if (exponentField != 0) {
    return false;
  }
  return mantissaField != 0;
}
119 
120  private:
// Lookup tables used by the conversion routines of this class.
// float16to32 reads mantissatable at offsettable[h>>10] + (h & 0x3ff) and
// adds exponenttable[h>>10]; h>>10 is a 6-bit index, hence 64 entries.
121  CMS_THREAD_SAFE static uint32_t mantissatable[2048];
122  CMS_THREAD_SAFE static uint32_t exponenttable[64];
123  CMS_THREAD_SAFE static uint16_t offsettable[64];
// float32to16crop / float32to16round index these two with the top nine bits
// of the float32, (i32 >> 23) & 0x1ff, hence 512 entries: basetable supplies
// the base half-float bits, shifttable the right-shift applied to the mantissa.
124  CMS_THREAD_SAFE static uint16_t basetable[512];
125  CMS_THREAD_SAFE static uint8_t shifttable[512];
// Defined out of line; presumably populates the tables above -- confirm
// against the implementation file.
126  static void filltables() ;
127 };
128 #endif
static uint16_t float32to16crop(float x)
Fast implementation, but it crops the number so it biases low.
Definition: libminifloat.h:19
static float min32RoundedToMin16()
Definition: libminifloat.h:101
static uint16_t offsettable[64]
Definition: libminifloat.h:123
static HepMC::IO_HEPEVT conv
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion that matches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision bits
static float float16to32(uint16_t h)
Definition: libminifloat.h:10
static uint16_t basetable[512]
Definition: libminifloat.h:124
#define constexpr
static float denorm_min()
Definition: libminifloat.h:109
static uint8_t shifttable[512]
Definition: libminifloat.h:125
static uint16_t float32to16(float x)
Definition: libminifloat.h:15
static uint32_t mantissatable[2048]
Definition: libminifloat.h:121
static float reduceMantissaToNbits(const float &f, int bits)
Definition: libminifloat.h:48
double f[11][100]
#define CMS_THREAD_SAFE
base
Make Sure CMSSW is Setup ##.
static float min()
Definition: libminifloat.h:94
static bool isdenorm(uint16_t h)
Definition: libminifloat.h:115
static float reduceMantissaToNbitsRounding(const float &f)
Definition: libminifloat.h:58
static float max32RoundedToMax16()
Definition: libminifloat.h:86
static unsigned int const shift
static float max()
Definition: libminifloat.h:79
static void filltables()
Definition: libminifloat.cc:18
static uint16_t float32to16round(float x)
Slower implementation, but it rounds to avoid biases.
Definition: libminifloat.h:25
static float reduceMantissaToNbits(const float &f)
Definition: libminifloat.h:39
static uint32_t exponenttable[64]
Definition: libminifloat.h:122