1 /*
2 * C-implementation of Float16 - https://github.com/Maratyszcza/FP16
3 *
4 * Version: 0a92994d729ff76a58f692d3028ca1b64b145d91
5 *
6 * The MIT License (MIT)
7 *
8 * Copyright (c) 2017 Facebook Inc.
9 * Copyright (c) 2017 Georgia Institute of Technology
10 * Copyright 2019 Google LLC
11 *
12 * Permission is hereby granted, free of charge, to any person obtaining a copy of this
13 * software and associated documentation files (the "Software"), to deal in the Software
14 * without restriction, including without limitation the rights to use, copy, modify,
15 * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
16 * permit persons to whom the Software is furnished to do so, subject to the following
17 * conditions:
18 *
19 * The above copyright notice and this permission notice shall be included in all copies
20 * or substantial portions of the Software.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
23 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
24 * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
25 * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
26 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 * DEALINGS IN THE SOFTWARE.
28 */
29
30 /*
31 * The following changes were done to adapt this to Erlang/OTP conventions:
32 *
33 * 1. uint16_t and uint32_t have been rewritten to Uint16 and Uint32
34 * 2. Mixed declarations were moved to the top to avoid warnings
35 * 3. inline was rewritten as ERTS_INLINE
36 * 4. UINT16_C(x) and UINT32_C(x) were rewriten to xU as we don't support 16bits platform
37 */
38
fp32_from_bits(Uint32 w)39 static ERTS_INLINE float fp32_from_bits(Uint32 w) {
40 union {
41 Uint32 as_bits;
42 float as_value;
43 } fp32 = { w };
44 return fp32.as_value;
45 }
46
fp32_to_bits(float f)47 static ERTS_INLINE Uint32 fp32_to_bits(float f) {
48 union {
49 float as_value;
50 Uint32 as_bits;
51 } fp32 = { f };
52 return fp32.as_bits;
53 }
54
55 /*
56 * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
57 * IEEE half-precision format, in bit representation.
58 *
59 * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
60 * floating-point operations and bitcasts between integer and floating-point variables.
61 */
fp16_ieee_from_fp32_value(float f)62 static ERTS_INLINE Uint16 fp16_ieee_from_fp32_value(float f) {
63 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
64 const float scale_to_inf = 0x1.0p+112f;
65 const float scale_to_zero = 0x1.0p-110f;
66 #else
67 const float scale_to_inf = fp32_from_bits(0x77800000U);
68 const float scale_to_zero = fp32_from_bits(0x08800000U);
69 #endif
70 float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
71
72 const Uint32 w = fp32_to_bits(f);
73 const Uint32 shl1_w = w + w;
74 const Uint32 sign = w & 0x80000000U;
75
76 Uint32 bits, exp_bits, mantissa_bits, nonsign, bias;
77
78 bias = shl1_w & 0xFF000000U;
79
80 if (bias < 0x71000000U) {
81 bias = 0x71000000U;
82 }
83
84 base = fp32_from_bits((bias >> 1) + 0x07800000U) + base;
85 bits = fp32_to_bits(base);
86 exp_bits = (bits >> 13) & 0x00007C00U;
87 mantissa_bits = bits & 0x00000FFFU;
88 nonsign = exp_bits + mantissa_bits;
89 return (sign >> 16) | (shl1_w > 0xFF000000U ? 0x7E00U : nonsign);
90 }
91
92 /*
93 * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
94 * a 32-bit floating-point number in IEEE single-precision format.
95 *
96 * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
97 * floating-point operations and bitcasts between integer and floating-point variables.
98 */
fp16_ieee_to_fp32_value(Uint16 h)99 static ERTS_INLINE float fp16_ieee_to_fp32_value(Uint16 h) {
100 /*
101 * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
102 * +---+-----+------------+-------------------+
103 * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
104 * +---+-----+------------+-------------------+
105 * Bits 31 26-30 16-25 0-15
106 *
107 * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
108 */
109 const Uint32 w = (Uint32) h << 16;
110 /*
111 * Extract the sign of the input number into the high bit of the 32-bit word:
112 *
113 * +---+----------------------------------+
114 * | S |0000000 00000000 00000000 00000000|
115 * +---+----------------------------------+
116 * Bits 31 0-31
117 */
118 const Uint32 sign = w & 0x80000000U;
119 /*
120 * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
121 *
122 * +-----+------------+---------------------+
123 * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
124 * +-----+------------+---------------------+
125 * Bits 27-31 17-26 0-16
126 */
127 const Uint32 two_w = w + w;
128
129 /*
130 * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
131 * of a single-precision floating-point number:
132 *
133 * S|Exponent | Mantissa
134 * +-+---+-----+------------+----------------+
135 * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
136 * +-+---+-----+------------+----------------+
137 * Bits | 23-31 | 0-22
138 *
139 * Next, there are some adjustments to the exponent:
140 * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
141 * formats (0x7F - 0xF = 0x70)
142 * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
143 * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
144 * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
145 * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
146 * by the difference in the exponent bias (see above).
147 * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
148 * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
149 * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least
150 * partially IEEE754-compliant implementations.
151 *
152 * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
153 * operate on denormal inputs, and do not produce denormal results.
154 */
155 const Uint32 exp_offset = 0xE0U << 23;
156 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
157 const float exp_scale = 0x1.0p-112f;
158 #else
159 const float exp_scale = fp32_from_bits(0x7800000U);
160 #endif
161 const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
162
163 /*
164 * Convert denormalized half-precision inputs into single-precision results (always normalized).
165 * Zero inputs are also handled here.
166 *
167 * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits.
168 * First, we shift mantissa into bits 0-9 of the 32-bit word.
169 *
170 * zeros | mantissa
171 * +---------------------------+------------+
172 * |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
173 * +---------------------------+------------+
174 * Bits 10-31 0-9
175 *
176 * Now, remember that denormalized half-precision numbers are represented as:
177 * FP16 = mantissa * 2**(-24).
178 * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input
179 * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
180 * A normalized single-precision floating-point number is represented as:
181 * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
182 * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
183 * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount.
184 *
185 * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
186 * is zero, the constructed single-precision number has the value of
187 * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
188 * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
189 * the input half-precision number.
190 */
191 const Uint32 magic_mask = 126U << 23;
192 const float magic_bias = 0.5f;
193 const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
194
195 /*
196 * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
197 * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the
198 * input is either a denormal number, or zero.
199 * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
200 */
201 const Uint32 denormalized_cutoff = 1U << 27;
202 const Uint32 result = sign |
203 (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
204 return fp32_from_bits(result);
205 }
206