Go to the documentation of this file.
39 uint16_t
base2 = (
conv.i32 & 0x007fffff) >> 12;
50 static_assert(bits <= 23,
"max mantissa size is 23 bits");
51 constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
61 uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
78 constexpr uint32_t low23 = (0x007FFFFF);
79 constexpr uint32_t hi9 = (0xFF800000);
86 uint32_t mantissa = (
conv.i32 & low23) >>
shift;
111 template <
typename InItr,
typename OutItr>
116 inline static float max() {
121 conv.i32 = 0x477fe000;
133 conv.i32 = (0x8f << 23) - 1;
137 inline static float min() {
142 conv.i32 = 0x38800000;
154 conv.i32 = (0x71 << 23);
163 conv.i32 = 0x33800000;
169 return ((
h >> 10) & 0x1f) == 0 && (
h & 0x3ff) != 0;
static float reduceMantissaToNbitsRounding(float f, int bits)
static uint16_t float32to16round(float x)
Slower implementation, but it rounds to avoid biases.
static uint8_t shifttable[512]
static float reduceMantissaToNbits(const float &f)
static uint32_t mantissatable[2048]
static uint16_t float32to16crop(float x)
Fast implementation, but it crops the number so it biases low.
ReduceMantissaToNbitsRounding(int bits)
static uint16_t basetable[512]
static float reduceMantissaToNbitsRounding(const float &f)
static uint16_t offsettable[64]
static float max32RoundedToMax16()
static float float16to32(uint16_t h)
static uint32_t exponenttable[64]
static float reduceMantissaToNbits(const float &f, int bits)
static unsigned const int shift
static float min32RoundedToMin16()
float operator()(float f) const
static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out)
static float denorm_min()
static bool isdenorm(uint16_t h)
static uint16_t float32to16(float x)