1 /*
2  * C-implementation of Float16 - https://github.com/Maratyszcza/FP16
3  *
4  * Version: 0a92994d729ff76a58f692d3028ca1b64b145d91
5  *
6  * The MIT License (MIT)
7  *
8  * Copyright (c) 2017 Facebook Inc.
9  * Copyright (c) 2017 Georgia Institute of Technology
10  * Copyright 2019 Google LLC
11  *
12  * Permission is hereby granted, free of charge, to any person obtaining a copy of this
13  * software and associated documentation files (the "Software"), to deal in the Software
14  * without restriction, including without limitation the rights to use, copy, modify,
15  * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
16  * permit persons to whom the Software is furnished to do so, subject to the following
17  * conditions:
18  *
19  * The above copyright notice and this permission notice shall be included in all copies
20  * or substantial portions of the Software.
21  *
22  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
23  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
24  * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
25  * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
26  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  */
29 
30 /*
31  * The following changes were done to adapt this to Erlang/OTP conventions:
32  *
33  * 1. uint16_t and uint32_t have been rewritten to Uint16 and Uint32
34  * 2. Mixed declarations were moved to the top to avoid warnings
35  * 3. inline was rewritten as ERTS_INLINE
36  * 4. UINT16_C(x) and UINT32_C(x) were rewriten to xU as we don't support 16bits platform
37  */
38 
fp32_from_bits(Uint32 w)39 static ERTS_INLINE float fp32_from_bits(Uint32 w) {
40     union {
41 	Uint32 as_bits;
42 	float as_value;
43     } fp32 = { w };
44     return fp32.as_value;
45 }
46 
fp32_to_bits(float f)47 static ERTS_INLINE Uint32 fp32_to_bits(float f) {
48     union {
49 	float as_value;
50 	Uint32 as_bits;
51     } fp32 = { f };
52     return fp32.as_bits;
53 }
54 
55 /*
56  * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
57  * IEEE half-precision format, in bit representation.
58  *
59  * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
60  * floating-point operations and bitcasts between integer and floating-point variables.
61  */
fp16_ieee_from_fp32_value(float f)62 static ERTS_INLINE Uint16 fp16_ieee_from_fp32_value(float f) {
63 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
64     const float scale_to_inf = 0x1.0p+112f;
65     const float scale_to_zero = 0x1.0p-110f;
66 #else
67     const float scale_to_inf = fp32_from_bits(0x77800000U);
68     const float scale_to_zero = fp32_from_bits(0x08800000U);
69 #endif
70     float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
71 
72     const Uint32 w = fp32_to_bits(f);
73     const Uint32 shl1_w = w + w;
74     const Uint32 sign = w & 0x80000000U;
75 
76     Uint32 bits, exp_bits, mantissa_bits, nonsign, bias;
77 
78     bias = shl1_w & 0xFF000000U;
79 
80     if (bias < 0x71000000U) {
81 	bias = 0x71000000U;
82     }
83 
84     base = fp32_from_bits((bias >> 1) + 0x07800000U) + base;
85     bits = fp32_to_bits(base);
86     exp_bits = (bits >> 13) & 0x00007C00U;
87     mantissa_bits = bits & 0x00000FFFU;
88     nonsign = exp_bits + mantissa_bits;
89     return (sign >> 16) | (shl1_w > 0xFF000000U ? 0x7E00U : nonsign);
90 }
91 
92 /*
93  * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
94  * a 32-bit floating-point number in IEEE single-precision format.
95  *
96  * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
97  * floating-point operations and bitcasts between integer and floating-point variables.
98  */
fp16_ieee_to_fp32_value(Uint16 h)99 static ERTS_INLINE float fp16_ieee_to_fp32_value(Uint16 h) {
100     /*
101      * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
102      *      +---+-----+------------+-------------------+
103      *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
104      *      +---+-----+------------+-------------------+
105      * Bits  31  26-30    16-25            0-15
106      *
107      * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
108      */
109     const Uint32 w = (Uint32) h << 16;
110     /*
111      * Extract the sign of the input number into the high bit of the 32-bit word:
112      *
113      *      +---+----------------------------------+
114      *      | S |0000000 00000000 00000000 00000000|
115      *      +---+----------------------------------+
116      * Bits  31                 0-31
117      */
118     const Uint32 sign = w & 0x80000000U;
119     /*
120      * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
121      *
122      *      +-----+------------+---------------------+
123      *      |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
124      *      +-----+------------+---------------------+
125      * Bits  27-31    17-26            0-16
126      */
127     const Uint32 two_w = w + w;
128 
129     /*
130      * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
131      * of a single-precision floating-point number:
132      *
133      *       S|Exponent |          Mantissa
134      *      +-+---+-----+------------+----------------+
135      *      |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
136      *      +-+---+-----+------------+----------------+
137      * Bits   | 23-31   |           0-22
138      *
139      * Next, there are some adjustments to the exponent:
140      * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
141      *   formats (0x7F - 0xF = 0x70)
142      * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
143      *   Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
144      *   of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
145      *   - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
146      *     by the difference in the exponent bias (see above).
147      *   - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
148      *     exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
149      *     The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least
150      *     partially IEEE754-compliant implementations.
151      *
152      * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
153      * operate on denormal inputs, and do not produce denormal results.
154      */
155     const Uint32 exp_offset = 0xE0U << 23;
156 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
157     const float exp_scale = 0x1.0p-112f;
158 #else
159     const float exp_scale = fp32_from_bits(0x7800000U);
160 #endif
161     const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
162 
163     /*
164      * Convert denormalized half-precision inputs into single-precision results (always normalized).
165      * Zero inputs are also handled here.
166      *
167      * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits.
168      * First, we shift mantissa into bits 0-9 of the 32-bit word.
169      *
170      *                  zeros           |  mantissa
171      *      +---------------------------+------------+
172      *      |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
173      *      +---------------------------+------------+
174      * Bits             10-31                0-9
175      *
176      * Now, remember that denormalized half-precision numbers are represented as:
177      *    FP16 = mantissa * 2**(-24).
178      * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input
179      * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
180      * A normalized single-precision floating-point number is represented as:
181      *    FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
182      * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
183      * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount.
184      *
185      * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
186      * is zero, the constructed single-precision number has the value of
187      *    FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
188      * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
189      * the input half-precision number.
190      */
191     const Uint32 magic_mask = 126U << 23;
192     const float magic_bias = 0.5f;
193     const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
194 
195     /*
196      * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
197      *   input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the
198      *   input is either a denormal number, or zero.
199      * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
200      */
201     const Uint32 denormalized_cutoff = 1U << 27;
202     const Uint32 result = sign |
203 	(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
204     return fp32_from_bits(result);
205 }
206