CMS 3D CMS Logo

libminifloat.h
Go to the documentation of this file.
1 #ifndef libminifloat_h
2 #define libminifloat_h
4 #include <cstdint>
5 #include <cassert>
6 #include <algorithm>
7 
8 // ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
10 public:
12  inline static float float16to32(uint16_t h) {
13  union {
14  float flt;
15  uint32_t i32;
16  } conv;
17  conv.i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
18  return conv.flt;
19  }
20  inline static uint16_t float32to16(float x) { return float32to16round(x); }
22  inline static uint16_t float32to16crop(float x) {
23  union {
24  float flt;
25  uint32_t i32;
26  } conv;
27  conv.flt = x;
28  return basetable[(conv.i32 >> 23) & 0x1ff] + ((conv.i32 & 0x007fffff) >> shifttable[(conv.i32 >> 23) & 0x1ff]);
29  }
31  inline static uint16_t float32to16round(float x) {
32  union {
33  float flt;
34  uint32_t i32;
35  } conv;
36  conv.flt = x;
37  uint8_t shift = shifttable[(conv.i32 >> 23) & 0x1ff];
38  if (shift == 13) {
39  uint16_t base2 = (conv.i32 & 0x007fffff) >> 12;
40  uint16_t base = base2 >> 1;
41  if (((base2 & 1) != 0) && (base < 1023))
42  base++;
43  return basetable[(conv.i32 >> 23) & 0x1ff] + base;
44  } else {
45  return basetable[(conv.i32 >> 23) & 0x1ff] + ((conv.i32 & 0x007fffff) >> shifttable[(conv.i32 >> 23) & 0x1ff]);
46  }
47  }
/// Truncates the mantissa of a float to its `bits` most significant bits
/// (compile-time bit count); the dropped low bits are set to zero.
///
/// @tparam bits number of mantissa bits to keep, in [0, 23].
/// @param f value to truncate.
/// @return f with its lowest (23 - bits) mantissa bits cleared; sign and
///         exponent are untouched.
template <int bits>
inline static float reduceMantissaToNbits(const float &f) {
  static_assert(bits <= 23, "max mantissa size is 23 bits");
  // A negative count would widen the shift past the mantissa and clear
  // exponent/sign bits, so reject it at compile time as well.
  static_assert(bits >= 0, "mantissa size cannot be negative");
  constexpr int dropped = 23 - bits;
  constexpr uint32_t mask = 0xFFFFFFFFu << dropped;
  union {
    float flt;
    uint32_t i32;
  } conv;
  conv.flt = f;
  conv.i32 &= mask;
  return conv.flt;
}
/// Truncates the mantissa of a float to its `bits` most significant bits
/// (run-time bit count); the dropped low bits are set to zero.
///
/// @param f    value to truncate.
/// @param bits number of mantissa bits to keep; must be in [0, 23].
/// @return f with its lowest (23 - bits) mantissa bits cleared; sign and
///         exponent are untouched.
inline static float reduceMantissaToNbits(const float &f, int bits) {
  // Outside this range the shift count below would be negative or would reach
  // into the exponent/sign bits.
  assert(bits >= 0 && bits <= 23);  // "mantissa size must be within 0..23 bits"
  const int dropped = 23 - bits;
  const uint32_t mask = 0xFFFFFFFFu << dropped;
  union {
    float flt;
    uint32_t i32;
  } conv;
  conv.flt = f;
  conv.i32 &= mask;
  return conv.flt;
}
70 
72  public:
74  : shift(23 - bits), mask((0xFFFFFFFF >> (shift)) << (shift)), test(1 << (shift - 1)), maxn((1 << bits) - 2) {
75  assert(bits <= 23); // "max mantissa size is 23 bits"
76  }
77  float operator()(float f) const {
78  constexpr uint32_t low23 = (0x007FFFFF); // mask to keep lowest 23 bits = mantissa
79  constexpr uint32_t hi9 = (0xFF800000); // mask to keep highest 9 bits = the rest
80  union {
81  float flt;
82  uint32_t i32;
83  } conv;
84  conv.flt = f;
85  if (conv.i32 & test) { // need to round
86  uint32_t mantissa = (conv.i32 & low23) >> shift;
87  if (mantissa < maxn)
88  mantissa++;
89  conv.i32 = (conv.i32 & hi9) | (mantissa << shift);
90  } else {
91  conv.i32 &= mask;
92  }
93  return conv.flt;
94  }
95 
96  private:
97  const int shift;
98  const uint32_t mask, test, maxn;
99  };
100 
101  template <int bits>
102  inline static float reduceMantissaToNbitsRounding(const float &f) {
103  static const ReduceMantissaToNbitsRounding reducer(bits);
104  return reducer(f);
105  }
106 
107  inline static float reduceMantissaToNbitsRounding(float f, int bits) {
109  }
110 
111  template <typename InItr, typename OutItr>
112  static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out) {
114  }
115 
/// Largest finite value representable in the 16-bit format, as a float32.
///
/// @return the float with bit pattern 0x477fe000 (= 65504), i.e.
///         mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e].
inline static float max() {
  union {
    uint32_t raw;
    float value;
  } pun = {0x477fe000};
  return pun.value;
}
124 
/// Maximum float32 value that gets rounded to max().
///
/// 2^16 is the first float32 that results in inf in float16, so the value
/// returned here is the float32 immediately below 2^16: bit pattern
/// (0x8f << 23) - 1, i.e. 65536 - 2^-8.
///
/// @return the largest float32 strictly smaller than 2^16.
inline static float max32RoundedToMax16() {
  union {
    uint32_t raw;
    float value;
  } pun = {(0x8fu << 23) - 1};
  return pun.value;
}
136 
/// Smallest positive normalized value of the 16-bit format, as a float32.
///
/// @return the float with bit pattern 0x38800000 (= 2^-14), i.e.
///         mantissatable[offsettable[1]+0]+exponenttable[1].
inline static float min() {
  union {
    uint32_t raw;
    float value;
  } pun = {0x38800000};
  return pun.value;
}
145 
/// Minimum float32 value that gets rounded to min().
///
/// Float32 values below 2^-14 result in a denormalized float16, so 2^-14
/// itself (bit pattern 0x71 << 23, the same pattern min() returns) is the
/// first float32 that results in min().
///
/// @return 2^-14 as a float32.
inline static float min32RoundedToMin16() {
  union {
    uint32_t raw;
    float value;
  } pun = {(0x71u << 23)};
  return pun.value;
}
157 
/// Smallest positive denormalized value of the 16-bit format, as a float32.
///
/// @return the float with bit pattern 0x33800000 (= 2^-24), i.e.
///         mantissatable[offsettable[0]+1]+exponenttable[0].
inline static float denorm_min() {
  union {
    uint32_t raw;
    float value;
  } pun = {0x33800000};
  return pun.value;
}
166 
/// Tells whether a 16-bit pattern encodes a denormalized number.
///
/// @param h 16-bit pattern to inspect.
/// @return true when the 5-bit exponent field (bits 10..14, sign bit
///         excluded) is zero and the 10-bit mantissa field is non-zero.
inline static bool isdenorm(uint16_t h) {
  const unsigned exponentField = (h >> 10) & 0x1f;
  const unsigned mantissaField = h & 0x3ff;
  if (exponentField != 0) {
    return false;
  }
  return mantissaField != 0;
}
171 
172 private:
// Lookup tables backing the conversions in this class.
// NOTE(review): presumably populated by filltables(); its definition is not in
// this header -- confirm in the implementation file.
// 16 -> 32 bit: indexed by offsettable[h >> 10] + (h & 0x3ff).
173  CMS_THREAD_SAFE static uint32_t mantissatable[2048];
// 16 -> 32 bit: indexed by the upper six bits of the half float (h >> 10).
174  CMS_THREAD_SAFE static uint32_t exponenttable[64];
// 16 -> 32 bit: indexed by h >> 10; gives the base index into mantissatable.
175  CMS_THREAD_SAFE static uint16_t offsettable[64];
// 32 -> 16 bit: indexed by the upper nine bits of the float ((i32 >> 23) & 0x1ff).
176  CMS_THREAD_SAFE static uint16_t basetable[512];
// 32 -> 16 bit: right-shift applied to the 23-bit mantissa; same index as basetable.
177  CMS_THREAD_SAFE static uint8_t shifttable[512];
// Declared here, defined elsewhere.
178  static void filltables();
179 };
180 #endif
MiniFloatConverter::ReduceMantissaToNbitsRounding::mask
const uint32_t mask
Definition: libminifloat.h:98
MiniFloatConverter::reduceMantissaToNbitsRounding
static float reduceMantissaToNbitsRounding(float f, int bits)
Definition: libminifloat.h:107
f
double f[11][100]
Definition: MuScleFitUtils.cc:78
conv
static HepMC::IO_HEPEVT conv
Definition: BeamHaloProducer.cc:48
MiniFloatConverter::float32to16round
static uint16_t float32to16round(float x)
Slower implementation, but it rounds to avoid biases.
Definition: libminifloat.h:31
MiniFloatConverter::shifttable
static uint8_t shifttable[512]
Definition: libminifloat.h:177
cms::cuda::assert
assert(be >=bs)
MiniFloatConverter::reduceMantissaToNbits
static float reduceMantissaToNbits(const float &f)
Definition: libminifloat.h:49
DDAxes::x
MiniFloatConverter::mantissatable
static uint32_t mantissatable[2048]
Definition: libminifloat.h:173
MiniFloatConverter::ReduceMantissaToNbitsRounding
Definition: libminifloat.h:71
end
#define end
Definition: vmac.h:39
MiniFloatConverter::float32to16crop
static uint16_t float32to16crop(float x)
Fast implementation, but it crops the number so it biases low.
Definition: libminifloat.h:22
MiniFloatConverter::MiniFloatConverter
MiniFloatConverter()
Definition: libminifloat.cc:12
MiniFloatConverter::ReduceMantissaToNbitsRounding::ReduceMantissaToNbitsRounding
ReduceMantissaToNbitsRounding(int bits)
Definition: libminifloat.h:73
test
Definition: SmallWORMDict.h:13
MiniFloatConverter::basetable
static uint16_t basetable[512]
Definition: libminifloat.h:176
MiniFloatConverter
Definition: libminifloat.h:9
HcalDetIdTransform::transform
unsigned transform(const HcalDetId &id, unsigned transformCode)
Definition: HcalDetIdTransform.cc:7
h
CMS_THREAD_SAFE
#define CMS_THREAD_SAFE
Definition: thread_safety_macros.h:4
MiniFloatConverter::reduceMantissaToNbitsRounding
static float reduceMantissaToNbitsRounding(const float &f)
Definition: libminifloat.h:102
MiniFloatConverter::offsettable
static uint16_t offsettable[64]
Definition: libminifloat.h:175
MiniFloatConverter::max32RoundedToMax16
static float max32RoundedToMax16()
Definition: libminifloat.h:126
thread_safety_macros.h
MiniFloatConverter::float16to32
static float float16to32(uint16_t h)
Definition: libminifloat.h:12
MiniFloatConverter::exponenttable
static uint32_t exponenttable[64]
Definition: libminifloat.h:174
bits
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision bits
Definition: EventSelector-behavior.doc:35
compare_using_db.base2
base2
Definition: compare_using_db.py:175
MiniFloatConverter::reduceMantissaToNbits
static float reduceMantissaToNbits(const float &f, int bits)
Definition: libminifloat.h:60
edm::shift
static unsigned const int shift
Definition: LuminosityBlockID.cc:7
MiniFloatConverter::min32RoundedToMin16
static float min32RoundedToMin16()
Definition: libminifloat.h:147
MiniFloatConverter::ReduceMantissaToNbitsRounding::operator()
float operator()(float f) const
Definition: libminifloat.h:77
MiniFloatConverter::reduceMantissaToNbitsRounding
static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out)
Definition: libminifloat.h:112
MillePedeFileConverter_cfg.out
out
Definition: MillePedeFileConverter_cfg.py:31
MiniFloatConverter::ReduceMantissaToNbitsRounding::shift
const int shift
Definition: libminifloat.h:97
MiniFloatConverter::min
static float min()
Definition: libminifloat.h:137
MiniFloatConverter::denorm_min
static float denorm_min()
Definition: libminifloat.h:158
newFWLiteAna.base
base
Definition: newFWLiteAna.py:92
MiniFloatConverter::max
static float max()
Definition: libminifloat.h:116
MiniFloatConverter::filltables
static void filltables()
Definition: libminifloat.cc:20
begin
#define begin
Definition: vmac.h:32
MiniFloatConverter::ReduceMantissaToNbitsRounding::test
const uint32_t test
Definition: libminifloat.h:98
MiniFloatConverter::isdenorm
static bool isdenorm(uint16_t h)
Definition: libminifloat.h:167
MiniFloatConverter::float32to16
static uint16_t float32to16(float x)
Definition: libminifloat.h:20
MiniFloatConverter::ReduceMantissaToNbitsRounding::maxn
const uint32_t maxn
Definition: libminifloat.h:98