1 #pragma once
2 
3 /*
4  * This code snippet posted by user Phernost on
5  * https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
6  *
7  * compress and decompress methods are made "inline" for performance
8  */
9 
/*
 * Branch-free conversion between 32-bit IEEE-754 floats and 16-bit
 * half-precision bit patterns.
 *
 * compress()   float    -> uint16_t holding the half-precision bits
 * decompress() uint16_t -> float
 *
 * All selections are done with all-ones / all-zeros integer masks instead
 * of branches. The float's bit pattern is accessed through the Bits union,
 * which relies on the union type-punning behaviour supported by the common
 * compilers (GCC, Clang, MSVC).
 *
 * Sign handling is done in unsigned arithmetic so that no shift ever moves
 * a bit into or out of the sign position of a signed integer.
 */
class Float16Compressor
{
	// Views the same 32 bits as a float, a signed or an unsigned integer.
	union Bits
	{
		float f;
		int32_t si;
		uint32_t ui;
	};

	static int const shift = 13;     // flt32 mantissa bits (23) minus flt16 mantissa bits (10)
	static int const shiftSign = 16; // distance between the flt32 and flt16 sign bits

	static int32_t const infN = 0x7F800000; // flt32 infinity
	static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32
	static int32_t const minN = 0x38800000; // min flt16 normal as a flt32
	static uint32_t const signN = 0x80000000u; // flt32 sign bit (unsigned: does not fit in int32_t)

	static int32_t const infC = infN >> shift;
	static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
	static int32_t const maxC = maxN >> shift;
	static int32_t const minC = minN >> shift;
	static uint32_t const signC = signN >> shiftSign; // flt16 sign bit

	static int32_t const mulN = 0x52000000; // (1 << 23) / minN
	static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift))

	static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted
	static int32_t const norC = 0x00400; // min flt32 normal down shifted

	// Exponent re-bias offsets applied after (compress) / before (decompress)
	// the mantissa shift.
	static int32_t const maxD = infC - maxC - 1;
	static int32_t const minD = minC - subC - 1;

public:

	/*
	 * Converts a float to half-precision bits.
	 *
	 * - Mantissa bits that do not fit are dropped (truncation, no rounding).
	 * - Magnitudes below the smallest half normal become half subnormals
	 *   (or zero).
	 * - Finite magnitudes above the largest half normal become infinity.
	 * - NaNs stay NaNs: a payload that would be shifted out entirely is
	 *   replaced by the smallest half NaN.
	 * - The sign bit is always carried over.
	 */
	inline static uint16_t compress(float value)
	{
		Bits v;
		v.f = value;

		// Peel off the sign; everything below works on the magnitude only.
		uint32_t sign = v.ui & signN;
		v.ui ^= sign;
		sign >>= shiftSign; // logical shift

		// All ones when the magnitude is below the smallest half normal.
		const int32_t belowMinNormal = -static_cast<int32_t>(minN > v.si);

		// Subnormal correction: scale the magnitude so that its integer part
		// is the half subnormal mantissa (still up-shifted by `shift`).
		// The operand is forced to +0.0f for inputs that do not need the
		// correction, so the product is always below 2^23 and the
		// float-to-int conversion can never be out of range.
		Bits tiny;
		tiny.si = v.si & belowMinNormal;
		Bits scale;
		scale.si = mulN;
		const int32_t subnormal = static_cast<int32_t>(scale.f * tiny.f);
		v.si ^= (subnormal ^ v.si) & belowMinNormal;

		// Finite but too large for a half -> infinity.
		v.si ^= (infN ^ v.si) & -static_cast<int32_t>((infN > v.si) & (v.si > maxN));
		// NaN whose payload would vanish in the shift -> smallest half NaN.
		v.si ^= (nanN ^ v.si) & -static_cast<int32_t>((nanN > v.si) & (v.si > infN));

		v.ui >>= shift; // logical shift, drops the low mantissa bits

		// Re-bias the exponent: once for inf/NaN, once for every non-subnormal.
		v.si ^= ((v.si - maxD) ^ v.si) & -static_cast<int32_t>(v.si > maxC);
		v.si ^= ((v.si - minD) ^ v.si) & -static_cast<int32_t>(v.si > subC);

		return static_cast<uint16_t>(v.ui | sign);
	}

	/*
	 * Converts half-precision bits back to a float. Every half value
	 * (zero, subnormal, normal, infinity, NaN; either sign) maps to the
	 * float with the same value; NaN payload bits are kept, up-shifted.
	 */
	inline static float decompress(uint16_t value)
	{
		Bits v;
		v.ui = value;

		// Peel off the sign and move it to the flt32 sign position.
		uint32_t sign = v.ui & signC;
		v.ui ^= sign;
		sign <<= shiftSign;

		// Undo the exponent re-bias: once for every non-subnormal,
		// once more for inf/NaN.
		v.si ^= ((v.si + minD) ^ v.si) & -static_cast<int32_t>(v.si > subC);
		v.si ^= ((v.si + maxD) ^ v.si) & -static_cast<int32_t>(v.si > maxC);

		// Subnormal path: the value is mantissa * mulC, computed in float.
		Bits s;
		s.si = mulC;
		s.f *= static_cast<float>(v.si);
		const int32_t mask = -static_cast<int32_t>(norC > v.si);

		// Normal path: move exponent and mantissa to their flt32 positions.
		// v.si is non-negative and at most 0x3FFFF here, so the shift fits.
		v.si <<= shift;

		// Pick the subnormal result where it applies, then restore the sign.
		v.si ^= (s.si ^ v.si) & mask;
		v.ui |= sign;
		return v.f;
	}
};