1 
2 //
3 // This source file is part of appleseed.
4 // Visit https://appleseedhq.net/ for additional information and resources.
5 //
6 // This software is released under the MIT license.
7 //
8 // Copyright (c) 2018 Francois Beaune, The appleseedhq Organization
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining a copy
11 // of this software and associated documentation files (the "Software"), to deal
12 // in the Software without restriction, including without limitation the rights
13 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 // copies of the Software, and to permit persons to whom the Software is
15 // furnished to do so, subject to the following conditions:
16 //
17 // The above copyright notice and this permission notice shall be included in
18 // all copies or substantial portions of the Software.
19 //
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 // THE SOFTWARE.
27 //
28 
29 #pragma once
30 
31 // appleseed.foundation headers.
32 #include "foundation/platform/compiler.h"
33 #ifdef APPLESEED_USE_SSE
34 #include "foundation/platform/sse.h"
35 #endif
36 #include "foundation/platform/types.h"
37 
38 // appleseed.main headers.
39 #include "main/dllsymbol.h"
40 
41 // Standard headers.
42 #include <cmath>
43 
44 namespace foundation
45 {
46 
47 //
48 // Half-precision (16-bit) floating-point type.
49 //
50 // This type should be compatible with OpenGL, OpenEXR, and IEEE 754r.
51 // The range is [-65504.0, 65504.0] and the precision is about 1 part
52 // in 2000 (3.3 decimal places).
53 //
54 // From OpenGL spec 2.1.2:
55 //
56 // A 16-bit floating-point number has a 1-bit sign (S), a 5-bit
57 // exponent (E), and a 10-bit mantissa (M). The value of a 16-bit
58 // floating-point number is determined by the following:
59 //
60 //   (-1)^S * 0.0,                          if E == 0 and M == 0,
61 //   (-1)^S * 2^-14 * (M/2^10),             if E == 0 and M != 0,
62 //   (-1)^S * 2^(E-15) * (1 + M/2^10),      if 0 < E < 31,
63 //   (-1)^S * INF,                          if E == 31 and M == 0, or
64 //   NaN                                    if E == 31 and M != 0
65 //
66 
67 class Half
68 {
69   public:
70     // Constructors.
71     Half();                                 // leave half value uninitialized
72     Half(const float rhs);                  // allow implicit float-to-half conversion
73 
74     // Construct a half by directly specifying its bits.
75     static Half from_bits(const uint16 bits);
76 
77     // Get underlying bits.
78     uint16 bits() const;
79 
80     // Implicit float-to-half conversion via assignment.
81     Half& operator=(const float rhs);
82 
83     // Implicit half-to-float conversion.
84     operator float() const;
85 
86   private:
87     APPLESEED_DLLSYMBOL static const uint32 s_h2f_table[65536];
88     APPLESEED_DLLSYMBOL static const uint16 s_f2h_table[512];
89     APPLESEED_DLLSYMBOL static uint16 float_to_half_except(const uint32 i);
90 
91     friend Half float_to_half(const float f);
92     friend float half_to_float(const Half h);
93 
94     uint16 m_bits;
95 };
96 
97 static_assert(sizeof(Half) == 2, "The size of foundation::Half must be exactly 2 bytes");
98 
99 
100 //
101 // Explicit conversion functions.
102 //
103 // The functions
104 //
105 //   foundation::float_to_half()
106 //   foundation::half_to_float()
107 //
108 // were borrowed from Ptex with minor, non-functional changes.
109 // The original license follows:
110 //
111 //   PTEX SOFTWARE
112 //   Copyright 2014 Disney Enterprises, Inc.  All rights reserved
113 //
114 //   Redistribution and use in source and binary forms, with or without
115 //   modification, are permitted provided that the following conditions are
116 //   met:
117 //
118 //     * Redistributions of source code must retain the above copyright
119 //       notice, this list of conditions and the following disclaimer.
120 //
121 //     * Redistributions in binary form must reproduce the above copyright
122 //       notice, this list of conditions and the following disclaimer in
123 //       the documentation and/or other materials provided with the
124 //       distribution.
125 //
126 //     * The names "Disney", "Walt Disney Pictures", "Walt Disney Animation
127 //       Studios" or the names of its contributors may NOT be used to
128 //       endorse or promote products derived from this software without
129 //       specific prior written permission from Walt Disney Pictures.
130 //
131 //   Disclaimer: THIS SOFTWARE IS PROVIDED BY WALT DISNEY PICTURES AND
132 //   CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
133 //   BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
134 //   FOR A PARTICULAR PURPOSE, NONINFRINGEMENT AND TITLE ARE DISCLAIMED.
135 //   IN NO EVENT SHALL WALT DISNEY PICTURES, THE COPYRIGHT HOLDER OR
136 //   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
137 //   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
138 //   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
139 //   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND BASED ON ANY
140 //   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
141 //   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
142 //   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
143 //
144 // The functions
145 //
146 //   foundation::float_to_half_alt()
147 //   foundation::fast_float_to_half()
148 //   foundation::half_to_float_alt()
149 //   foundation::float_to_half() (SSE variant)
150 //   foundation::half_to_float() (SSE variant)
151 //
152 // were borrowed from
153 //
154 //   Float->half variants, by Fabian "ryg" Giesen.
155 //   https://gist.github.com/rygorous/2156668
156 //
157 //   Half->float variants, by Fabian "ryg" Giesen.
158 //   https://gist.github.com/rygorous/2144712
159 //
160 // with minor, non-functional changes.
161 //
162 
163 // Float-to-half conversion function from Ptex.
164 Half float_to_half(const float f);
165 
166 // Alternate implementation by Fabian Giesen (called float_to_half_fast3() in his gist).
167 Half float_to_half_alt(const float f);
168 
169 // Approximate solution by Fabian Giesen (called approx_float_to_half() in his gist).
170 // This is faster but converts some sNaNs to infinity and doesn't round correctly. Handle with care.
171 Half fast_float_to_half(const float f);
172 
173 // Half-to-float conversion function from Ptex.
174 float half_to_float(const Half h);
175 
176 // Alternate implementation by Fabian Giesen (called half_to_float_fast5() in his gist).
177 // Turns FP16 denormals into FP32 denormals. Will be slower if denormals actually occur.
178 float half_to_float_alt(const Half h);
179 
180 #ifdef APPLESEED_USE_SSE
181 
182 __m128i float_to_half(const __m128 f);      // round-half-up (same as ISPC)
183 __m128 half_to_float(const __m128i h);
184 
185 #endif
186 
187 
188 //
189 // Half class implementation.
190 //
191 
Half()192 inline Half::Half()
193 {
194 }
195 
Half(const float rhs)196 inline Half::Half(const float rhs)
197   : m_bits(float_to_half(rhs).m_bits)
198 {
199 }
200 
from_bits(const uint16 bits)201 inline Half Half::from_bits(const uint16 bits)
202 {
203     Half h;
204     h.m_bits = bits;
205     return h;
206 }
207 
bits()208 inline uint16 Half::bits() const
209 {
210     return m_bits;
211 }
212 
213 inline Half& Half::operator=(const float rhs)
214 {
215     m_bits = float_to_half(rhs).m_bits;
216     return *this;
217 }
218 
219 inline Half::operator float() const
220 {
221     return half_to_float(*this);
222 }
223 
224 
225 //
226 // Explicit conversion functions implementation.
227 //
228 
float_to_half(const float f)229 inline Half float_to_half(const float f)
230 {
231 #ifdef APPLESEED_USE_F16C
232 
233     return
234         Half::from_bits(
235             _mm_extract_epi16(
236                 _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT),
237                 0));
238 
239 #else
240 
241     if (f == 0.0f)
242         return Half::from_bits(0);
243 
244     union { uint32 i; float f; } u;
245     u.f = f;
246 
247     const uint16 e = Half::s_f2h_table[(u.i >> 23) & 0x1ff];
248 
249     return e != 0
250         ? Half::from_bits(static_cast<uint16>(e + (((u.i & 0x7fffff) + 0x1000) >> 13)))
251         : Half::from_bits(Half::float_to_half_except(u.i));
252 
253 #endif
254 }
255 
float_to_half_alt(const float fl)256 inline Half float_to_half_alt(const float fl)
257 {
258     union FP32 { uint32 u; float f; };
259 
260     static const FP32 f32infty = { 255 << 23 };
261     static const FP32 f16infty = { 31 << 23 };
262     static const FP32 magic = { 15 << 23 };
263     const unsigned int sign_mask = 0x80000000u;
264     const unsigned int round_mask = ~0xfffu;
265 
266     uint16 o = 0;
267 
268     FP32 f;
269     f.f = fl;
270 
271     const unsigned int sign = f.u & sign_mask;
272     f.u ^= sign;
273 
274     // NOTE all the integer compares in this function can be safely
275     // compiled into signed compares since all operands are below
276     // 0x80000000. Important if you want fast straight SSE2 code
277     // (since there's no unsigned PCMPGTD).
278 
279     if (f.u >= f32infty.u)              // Inf or NaN (all exponent bits set)
280         o = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
281     else                                // (de)normalized number or zero
282     {
283         f.u &= round_mask;
284         f.f *= magic.f;
285         f.u -= round_mask;
286         if (f.u > f16infty.u)           // clamp to signed infinity if overflowed
287             f.u = f16infty.u;
288 
289         o = f.u >> 13;                  // take the bits!
290     }
291 
292     o |= sign >> 16;
293 
294     return Half::from_bits(o);
295 }
296 
fast_float_to_half(const float fl)297 inline Half fast_float_to_half(const float fl)
298 {
299     union FP32 { uint32 u; float f; };
300 
301     static const FP32 f32infty = { 255 << 23 };
302     static const FP32 f16max = { (127 + 16) << 23 };
303     static const FP32 magic = { 15 << 23 };
304     static const FP32 expinf = { (255 ^ 31) << 23 };
305     const unsigned int sign_mask = 0x80000000u;
306 
307     uint16 o = 0;
308 
309     FP32 f;
310     f.f = fl;
311 
312     const unsigned int sign = f.u & sign_mask;
313     f.u ^= sign;
314 
315     if (!(f.f < f32infty.u))            // Inf or NaN
316         o = f.u ^ expinf.u;
317     else
318     {
319         if (f.f > f16max.f)
320             f.f = f16max.f;
321         f.f *= magic.f;
322     }
323 
324     o = f.u >> 13;                      // take the mantissa bits
325     o |= sign >> 16;
326 
327     return Half::from_bits(o);
328 }
329 
half_to_float(const Half h)330 inline float half_to_float(const Half h)
331 {
332 #ifdef APPLESEED_USE_F16C_DISABLED      // currently slower than the table lookup
333 
334     return
335         _mm_cvtss_f32(
336             _mm_cvtph_ps(
337                 _mm_set1_epi16(h.m_bits)));
338 
339 #else
340 
341     union { uint32 i; float f; } u;
342     u.i = Half::s_h2f_table[h.m_bits];
343     return u.f;
344 
345 #endif
346 }
347 
half_to_float_alt(const Half h)348 inline float half_to_float_alt(const Half h)
349 {
350     union FP32 { uint32 u; float f; };
351 
352     static const FP32 magic = { (254 - 15) << 23 };
353     static const FP32 was_infnan = { (127 + 16) << 23 };
354 
355     FP32 o;
356     o.u = (h.bits() & 0x7fff) << 13;    // exponent/mantissa bits
357     o.f *= magic.f;                     // exponent adjust
358     if (o.f >= was_infnan.f)            // make sure Inf/NaN survive
359         o.u |= 255 << 23;
360     o.u |= (h.bits() & 0x8000) << 16;   // sign bit
361 
362     return o.f;
363 }
364 
365 #ifdef APPLESEED_USE_SSE
366 
float_to_half(const __m128 f)367 inline __m128i float_to_half(const __m128 f)
368 {
369 #ifdef APPLESEED_USE_F16C
370 
371     return _mm_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT);
372 
373 #else
374 
375     const __m128i mask_sign       = _mm_set1_epi32(0x80000000u);
376     const __m128i mask_round      = _mm_set1_epi32(~0xfffu);
377     const __m128i c_f32infty      = _mm_set1_epi32(255 << 23);
378     const __m128i c_magic         = _mm_set1_epi32(15 << 23);
379     const __m128i c_nanbit        = _mm_set1_epi32(0x200);
380     const __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
381     const __m128i c_clamp         = _mm_set1_epi32((31 << 23) - 0x1000);
382 
383     const __m128  msign           = _mm_castsi128_ps(mask_sign);
384     const __m128  justsign        = _mm_and_ps(msign, f);
385     const __m128i f32infty        = c_f32infty;
386     const __m128  absf            = _mm_xor_ps(f, justsign);
387     const __m128  mround          = _mm_castsi128_ps(mask_round);
388     const __m128i absf_int        = _mm_castps_si128(absf); // pseudo-op, but val needs to be copied once so count as mov
389     const __m128i b_isnan         = _mm_cmpgt_epi32(absf_int, f32infty);
390     const __m128i b_isnormal      = _mm_cmpgt_epi32(f32infty, _mm_castps_si128(absf));
391     const __m128i nanbit          = _mm_and_si128(b_isnan, c_nanbit);
392     const __m128i inf_or_nan      = _mm_or_si128(nanbit, c_infty_as_fp16);
393 
394     const __m128  fnosticky       = _mm_and_ps(absf, mround);
395     const __m128  scaled          = _mm_mul_ps(fnosticky, _mm_castsi128_ps(c_magic));
396     const __m128  clamped         = _mm_min_ps(scaled, _mm_castsi128_ps(c_clamp)); // logically, we want PMINSD on "biased", but this should gen better code
397     const __m128i biased          = _mm_sub_epi32(_mm_castps_si128(clamped), _mm_castps_si128(mround));
398     const __m128i shifted         = _mm_srli_epi32(biased, 13);
399     const __m128i normal          = _mm_and_si128(shifted, b_isnormal);
400     const __m128i not_normal      = _mm_andnot_si128(b_isnormal, inf_or_nan);
401     const __m128i joined          = _mm_or_si128(normal, not_normal);
402 
403     const __m128i sign_shift      = _mm_srli_epi32(_mm_castps_si128(justsign), 16);
404     const __m128i final           = _mm_or_si128(joined, sign_shift);
405 
406     // ~20 SSE2 ops
407     return final;
408 
409 #endif
410 }
411 
half_to_float(const __m128i h)412 inline __m128 half_to_float(const __m128i h)
413 {
414 #ifdef APPLESEED_USE_F16C
415 
416     return _mm_cvtph_ps(h);
417 
418 #else
419 
420 #define SSE_CONST4(name, val) static const APPLESEED_SIMD4_ALIGN unsigned int name[4] = { (val), (val), (val), (val) }
421 
422     SSE_CONST4(mask_nosign, 0x7fff);
423     SSE_CONST4(magic,       (254 - 15) << 23);
424     SSE_CONST4(was_infnan,  0x7bff);
425     SSE_CONST4(exp_infnan,  255 << 23);
426 
427     const __m128i mnosign         = *(const __m128i*)(&mask_nosign);
428     const __m128i expmant         = _mm_and_si128(mnosign, h);
429     const __m128i justsign        = _mm_xor_si128(h, expmant);
430     const __m128i expmant2        = expmant; // copy (just here for counting purposes)
431     const __m128i shifted         = _mm_slli_epi32(expmant, 13);
432     const __m128  scaled          = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
433     const __m128i b_wasinfnan     = _mm_cmpgt_epi32(expmant2, *(const __m128i*)(&was_infnan));
434     const __m128i sign            = _mm_slli_epi32(justsign, 16);
435     const __m128  infnanexp       = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), *(const __m128*)(&exp_infnan));
436     const __m128  sign_inf        = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
437     const __m128  final           = _mm_or_ps(scaled, sign_inf);
438 
439     // ~11 SSE2 ops.
440     return final;
441 
442 #undef SSE_CONST4
443 
444 #endif
445 }
446 
447 #endif  // APPLESEED_USE_SSE
448 
449 }   // namespace foundation
450