#pragma once

#include <cstdint>
#include <cstring>

/*
 * This code snippet was posted by user Phernost on
 * https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
 *
 * The compress and decompress methods are "inline" for performance.
 */

/*
 * Converts between 32-bit floats and 16-bit (half precision) bit patterns.
 *
 * All work is done on the raw bit pattern of the float, held in a uint32_t.
 * Selections are made with a mask-and-xor blend instead of if/else.
 *
 * Conversion to 16 bits truncates the extra mantissa bits (no rounding).
 * Any finite magnitude whose bit pattern is above maxN becomes infinity.
 */
class Float16Compressor
{
    static int const shift = 13;      // mantissa width difference (23 - 10)
    static int const shiftSign = 16;  // distance between the two sign bits

    static uint32_t const infN = 0x7F800000u;  // flt32 infinity
    static uint32_t const maxN = 0x477FE000u;  // max flt16 normal as a flt32
    static uint32_t const minN = 0x38800000u;  // min flt16 normal as a flt32
    static uint32_t const signN = 0x80000000u; // flt32 sign bit

    static uint32_t const infC = infN >> shift;
    static uint32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
    static uint32_t const maxC = maxN >> shift;
    static uint32_t const minC = minN >> shift;
    static uint32_t const signC = signN >> shiftSign; // flt16 sign bit

    static uint32_t const mulN = 0x52000000u; // (1 << 23) / minN
    static uint32_t const mulC = 0x33800000u; // minN / (1 << (23 - shift))

    static uint32_t const subC = 0x003FFu; // max flt32 subnormal down shifted
    static uint32_t const norC = 0x00400u; // min flt32 normal down shifted

    static uint32_t const maxD = infC - maxC - 1;
    static uint32_t const minD = minC - subC - 1;

    // Returns the bit pattern of a float (memcpy avoids union type punning).
    static uint32_t toBits(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof bits);
        return bits;
    }

    // Returns the float whose bit pattern is `bits`.
    static float fromBits(uint32_t bits)
    {
        float f;
        std::memcpy(&f, &bits, sizeof f);
        return f;
    }

    // Returns `replacement` when `condition` is true, otherwise `current`,
    // using an all-ones / all-zeros mask rather than a conditional.
    static uint32_t blend(uint32_t current, uint32_t replacement, bool condition)
    {
        uint32_t const mask = 0u - static_cast<uint32_t>(condition);
        return current ^ ((replacement ^ current) & mask);
    }

public:
    /*
     * Converts a 32-bit float to a 16-bit half precision bit pattern.
     *
     * value: the float to convert.
     * returns: the half precision bit pattern (sign in bit 15).
     */
    inline static uint16_t compress(float value)
    {
        uint32_t bits = toBits(value);
        uint32_t const sign = (bits & signN) >> shiftSign;
        bits &= ~signN; // work on the magnitude only

        // Magnitudes below the smallest flt16 normal are scaled by mulN and
        // converted to an integer, which yields the subnormal mantissa
        // (still shifted left by `shift`). The multiplicand is zeroed for
        // every other input so the float-to-integer conversion can never
        // be out of range.
        bool const belowNormal = bits < minN;
        float const scaled = fromBits(mulN) * fromBits(blend(0u, bits, belowNormal));
        bits = blend(bits, static_cast<uint32_t>(scaled), belowNormal);

        // Finite values above the largest flt16 normal become infinity.
        bits = blend(bits, infN, (bits < infN) && (bits > maxN));
        // NaNs whose payload would vanish in the shift are bumped to nanN
        // so they do not turn into infinity.
        bits = blend(bits, nanN, (bits < nanN) && (bits > infN));

        bits >>= shift; // drop the extra mantissa bits (truncation)

        // Re-bias the exponent: first close the gap between maxC and infC,
        // then the gap between the subnormals and minC.
        bits = blend(bits, bits - maxD, bits > maxC);
        bits = blend(bits, bits - minD, bits > subC);

        return static_cast<uint16_t>(bits | sign);
    }

    /*
     * Converts a 16-bit half precision bit pattern to a 32-bit float.
     *
     * value: the half precision bit pattern (sign in bit 15).
     * returns: the corresponding float.
     */
    inline static float decompress(uint16_t value)
    {
        uint32_t bits = value;
        uint32_t const halfSign = bits & signC;
        bits ^= halfSign; // work on the magnitude only
        uint32_t const sign = halfSign << shiftSign;

        // Undo the exponent re-bias applied by compress().
        bits = blend(bits, bits + minD, bits > subC);
        bits = blend(bits, bits + maxD, bits > maxC);

        // Subnormal halves are the integer mantissa scaled by mulC.
        float const scaled = fromBits(mulC) * static_cast<float>(bits);
        bool const isSubnormal = bits < norC;

        bits <<= shift;
        bits = blend(bits, toBits(scaled), isSubnormal);

        return fromBits(bits | sign);
    }
};