1
2 //
3 // This source file is part of appleseed.
4 // Visit https://appleseedhq.net/ for additional information and resources.
5 //
6 // This software is released under the MIT license.
7 //
8 // Copyright (c) 2018 Francois Beaune, The appleseedhq Organization
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining a copy
11 // of this software and associated documentation files (the "Software"), to deal
12 // in the Software without restriction, including without limitation the rights
13 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 // copies of the Software, and to permit persons to whom the Software is
15 // furnished to do so, subject to the following conditions:
16 //
17 // The above copyright notice and this permission notice shall be included in
18 // all copies or substantial portions of the Software.
19 //
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 // THE SOFTWARE.
27 //
28
29 #pragma once
30
31 // appleseed.foundation headers.
32 #include "foundation/platform/compiler.h"
33 #ifdef APPLESEED_USE_SSE
34 #include "foundation/platform/sse.h"
35 #endif
36 #include "foundation/platform/types.h"
37
38 // appleseed.main headers.
39 #include "main/dllsymbol.h"
40
41 // Standard headers.
42 #include <cmath>
43
44 namespace foundation
45 {
46
47 //
48 // Half-precision (16-bit) floating-point type.
49 //
50 // This type should be compatible with OpenGL, OpenEXR, and IEEE 754r.
51 // The range is [-65504.0, 65504.0] and the precision is about 1 part
52 // in 2000 (3.3 decimal places).
53 //
54 // From OpenGL spec 2.1.2:
55 //
56 // A 16-bit floating-point number has a 1-bit sign (S), a 5-bit
57 // exponent (E), and a 10-bit mantissa (M). The value of a 16-bit
58 // floating-point number is determined by the following:
59 //
60 // (-1)^S * 0.0, if E == 0 and M == 0,
61 // (-1)^S * 2^-14 * (M/2^10), if E == 0 and M != 0,
62 // (-1)^S * 2^(E-15) * (1 + M/2^10), if 0 < E < 31,
63 // (-1)^S * INF, if E == 31 and M == 0, or
64 // NaN if E == 31 and M != 0
65 //
66
67 class Half
68 {
69 public:
70 // Constructors.
71 Half(); // leave half value uninitialized
72 Half(const float rhs); // allow implicit float-to-half conversion
73
74 // Construct a half by directly specifying its bits.
75 static Half from_bits(const uint16 bits);
76
77 // Get underlying bits.
78 uint16 bits() const;
79
80 // Implicit float-to-half conversion via assignment.
81 Half& operator=(const float rhs);
82
83 // Implicit half-to-float conversion.
84 operator float() const;
85
86 private:
87 APPLESEED_DLLSYMBOL static const uint32 s_h2f_table[65536];
88 APPLESEED_DLLSYMBOL static const uint16 s_f2h_table[512];
89 APPLESEED_DLLSYMBOL static uint16 float_to_half_except(const uint32 i);
90
91 friend Half float_to_half(const float f);
92 friend float half_to_float(const Half h);
93
94 uint16 m_bits;
95 };
96
97 static_assert(sizeof(Half) == 2, "The size of foundation::Half must be exactly 2 bytes");
98
99
100 //
101 // Explicit conversion functions.
102 //
103 // The functions
104 //
105 // foundation::float_to_half()
106 // foundation::half_to_float()
107 //
108 // were borrowed from Ptex with minor, non-functional changes.
109 // The original license follows:
110 //
111 // PTEX SOFTWARE
112 // Copyright 2014 Disney Enterprises, Inc. All rights reserved
113 //
114 // Redistribution and use in source and binary forms, with or without
115 // modification, are permitted provided that the following conditions are
116 // met:
117 //
118 // * Redistributions of source code must retain the above copyright
119 // notice, this list of conditions and the following disclaimer.
120 //
121 // * Redistributions in binary form must reproduce the above copyright
122 // notice, this list of conditions and the following disclaimer in
123 // the documentation and/or other materials provided with the
124 // distribution.
125 //
126 // * The names "Disney", "Walt Disney Pictures", "Walt Disney Animation
127 // Studios" or the names of its contributors may NOT be used to
128 // endorse or promote products derived from this software without
129 // specific prior written permission from Walt Disney Pictures.
130 //
131 // Disclaimer: THIS SOFTWARE IS PROVIDED BY WALT DISNEY PICTURES AND
132 // CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
133 // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
134 // FOR A PARTICULAR PURPOSE, NONINFRINGEMENT AND TITLE ARE DISCLAIMED.
135 // IN NO EVENT SHALL WALT DISNEY PICTURES, THE COPYRIGHT HOLDER OR
136 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
137 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
138 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
139 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND BASED ON ANY
140 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
141 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
142 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
143 //
144 // The functions
145 //
146 // foundation::float_to_half_alt()
147 // foundation::fast_float_to_half()
148 // foundation::half_to_float_alt()
149 // foundation::float_to_half() (SSE variant)
150 // foundation::half_to_float() (SSE variant)
151 //
152 // were borrowed from
153 //
154 // Float->half variants, by Fabian "ryg" Giesen.
155 // https://gist.github.com/rygorous/2156668
156 //
157 // Half->float variants, by Fabian "ryg" Giesen.
158 // https://gist.github.com/rygorous/2144712
159 //
160 // with minor, non-functional changes.
161 //
162
163 // Float-to-half conversion function from Ptex.
164 Half float_to_half(const float f);
165
166 // Alternate implementation by Fabian Giesen (called float_to_half_fast3() in his gist).
167 Half float_to_half_alt(const float f);
168
169 // Approximate solution by Fabian Giesen (called approx_float_to_half() in his gist).
170 // This is faster but converts some sNaNs to infinity and doesn't round correctly. Handle with care.
171 Half fast_float_to_half(const float f);
172
173 // Half-to-float conversion function from Ptex.
174 float half_to_float(const Half h);
175
176 // Alternate implementation by Fabian Giesen (called half_to_float_fast5() in his gist).
177 // Turns FP16 denormals into FP32 denormals. Will be slower if denormals actually occur.
178 float half_to_float_alt(const Half h);
179
180 #ifdef APPLESEED_USE_SSE
181
182 __m128i float_to_half(const __m128 f); // round-half-up (same as ISPC)
183 __m128 half_to_float(const __m128i h);
184
185 #endif
186
187
188 //
189 // Half class implementation.
190 //
191
Half()192 inline Half::Half()
193 {
194 }
195
Half(const float rhs)196 inline Half::Half(const float rhs)
197 : m_bits(float_to_half(rhs).m_bits)
198 {
199 }
200
from_bits(const uint16 bits)201 inline Half Half::from_bits(const uint16 bits)
202 {
203 Half h;
204 h.m_bits = bits;
205 return h;
206 }
207
bits()208 inline uint16 Half::bits() const
209 {
210 return m_bits;
211 }
212
213 inline Half& Half::operator=(const float rhs)
214 {
215 m_bits = float_to_half(rhs).m_bits;
216 return *this;
217 }
218
219 inline Half::operator float() const
220 {
221 return half_to_float(*this);
222 }
223
224
225 //
226 // Explicit conversion functions implementation.
227 //
228
float_to_half(const float f)229 inline Half float_to_half(const float f)
230 {
231 #ifdef APPLESEED_USE_F16C
232
233 return
234 Half::from_bits(
235 _mm_extract_epi16(
236 _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT),
237 0));
238
239 #else
240
241 if (f == 0.0f)
242 return Half::from_bits(0);
243
244 union { uint32 i; float f; } u;
245 u.f = f;
246
247 const uint16 e = Half::s_f2h_table[(u.i >> 23) & 0x1ff];
248
249 return e != 0
250 ? Half::from_bits(static_cast<uint16>(e + (((u.i & 0x7fffff) + 0x1000) >> 13)))
251 : Half::from_bits(Half::float_to_half_except(u.i));
252
253 #endif
254 }
255
float_to_half_alt(const float fl)256 inline Half float_to_half_alt(const float fl)
257 {
258 union FP32 { uint32 u; float f; };
259
260 static const FP32 f32infty = { 255 << 23 };
261 static const FP32 f16infty = { 31 << 23 };
262 static const FP32 magic = { 15 << 23 };
263 const unsigned int sign_mask = 0x80000000u;
264 const unsigned int round_mask = ~0xfffu;
265
266 uint16 o = 0;
267
268 FP32 f;
269 f.f = fl;
270
271 const unsigned int sign = f.u & sign_mask;
272 f.u ^= sign;
273
274 // NOTE all the integer compares in this function can be safely
275 // compiled into signed compares since all operands are below
276 // 0x80000000. Important if you want fast straight SSE2 code
277 // (since there's no unsigned PCMPGTD).
278
279 if (f.u >= f32infty.u) // Inf or NaN (all exponent bits set)
280 o = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
281 else // (de)normalized number or zero
282 {
283 f.u &= round_mask;
284 f.f *= magic.f;
285 f.u -= round_mask;
286 if (f.u > f16infty.u) // clamp to signed infinity if overflowed
287 f.u = f16infty.u;
288
289 o = f.u >> 13; // take the bits!
290 }
291
292 o |= sign >> 16;
293
294 return Half::from_bits(o);
295 }
296
fast_float_to_half(const float fl)297 inline Half fast_float_to_half(const float fl)
298 {
299 union FP32 { uint32 u; float f; };
300
301 static const FP32 f32infty = { 255 << 23 };
302 static const FP32 f16max = { (127 + 16) << 23 };
303 static const FP32 magic = { 15 << 23 };
304 static const FP32 expinf = { (255 ^ 31) << 23 };
305 const unsigned int sign_mask = 0x80000000u;
306
307 uint16 o = 0;
308
309 FP32 f;
310 f.f = fl;
311
312 const unsigned int sign = f.u & sign_mask;
313 f.u ^= sign;
314
315 if (!(f.f < f32infty.u)) // Inf or NaN
316 o = f.u ^ expinf.u;
317 else
318 {
319 if (f.f > f16max.f)
320 f.f = f16max.f;
321 f.f *= magic.f;
322 }
323
324 o = f.u >> 13; // take the mantissa bits
325 o |= sign >> 16;
326
327 return Half::from_bits(o);
328 }
329
half_to_float(const Half h)330 inline float half_to_float(const Half h)
331 {
332 #ifdef APPLESEED_USE_F16C_DISABLED // currently slower than the table lookup
333
334 return
335 _mm_cvtss_f32(
336 _mm_cvtph_ps(
337 _mm_set1_epi16(h.m_bits)));
338
339 #else
340
341 union { uint32 i; float f; } u;
342 u.i = Half::s_h2f_table[h.m_bits];
343 return u.f;
344
345 #endif
346 }
347
half_to_float_alt(const Half h)348 inline float half_to_float_alt(const Half h)
349 {
350 union FP32 { uint32 u; float f; };
351
352 static const FP32 magic = { (254 - 15) << 23 };
353 static const FP32 was_infnan = { (127 + 16) << 23 };
354
355 FP32 o;
356 o.u = (h.bits() & 0x7fff) << 13; // exponent/mantissa bits
357 o.f *= magic.f; // exponent adjust
358 if (o.f >= was_infnan.f) // make sure Inf/NaN survive
359 o.u |= 255 << 23;
360 o.u |= (h.bits() & 0x8000) << 16; // sign bit
361
362 return o.f;
363 }
364
365 #ifdef APPLESEED_USE_SSE
366
float_to_half(const __m128 f)367 inline __m128i float_to_half(const __m128 f)
368 {
369 #ifdef APPLESEED_USE_F16C
370
371 return _mm_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT);
372
373 #else
374
375 const __m128i mask_sign = _mm_set1_epi32(0x80000000u);
376 const __m128i mask_round = _mm_set1_epi32(~0xfffu);
377 const __m128i c_f32infty = _mm_set1_epi32(255 << 23);
378 const __m128i c_magic = _mm_set1_epi32(15 << 23);
379 const __m128i c_nanbit = _mm_set1_epi32(0x200);
380 const __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
381 const __m128i c_clamp = _mm_set1_epi32((31 << 23) - 0x1000);
382
383 const __m128 msign = _mm_castsi128_ps(mask_sign);
384 const __m128 justsign = _mm_and_ps(msign, f);
385 const __m128i f32infty = c_f32infty;
386 const __m128 absf = _mm_xor_ps(f, justsign);
387 const __m128 mround = _mm_castsi128_ps(mask_round);
388 const __m128i absf_int = _mm_castps_si128(absf); // pseudo-op, but val needs to be copied once so count as mov
389 const __m128i b_isnan = _mm_cmpgt_epi32(absf_int, f32infty);
390 const __m128i b_isnormal = _mm_cmpgt_epi32(f32infty, _mm_castps_si128(absf));
391 const __m128i nanbit = _mm_and_si128(b_isnan, c_nanbit);
392 const __m128i inf_or_nan = _mm_or_si128(nanbit, c_infty_as_fp16);
393
394 const __m128 fnosticky = _mm_and_ps(absf, mround);
395 const __m128 scaled = _mm_mul_ps(fnosticky, _mm_castsi128_ps(c_magic));
396 const __m128 clamped = _mm_min_ps(scaled, _mm_castsi128_ps(c_clamp)); // logically, we want PMINSD on "biased", but this should gen better code
397 const __m128i biased = _mm_sub_epi32(_mm_castps_si128(clamped), _mm_castps_si128(mround));
398 const __m128i shifted = _mm_srli_epi32(biased, 13);
399 const __m128i normal = _mm_and_si128(shifted, b_isnormal);
400 const __m128i not_normal = _mm_andnot_si128(b_isnormal, inf_or_nan);
401 const __m128i joined = _mm_or_si128(normal, not_normal);
402
403 const __m128i sign_shift = _mm_srli_epi32(_mm_castps_si128(justsign), 16);
404 const __m128i final = _mm_or_si128(joined, sign_shift);
405
406 // ~20 SSE2 ops
407 return final;
408
409 #endif
410 }
411
half_to_float(const __m128i h)412 inline __m128 half_to_float(const __m128i h)
413 {
414 #ifdef APPLESEED_USE_F16C
415
416 return _mm_cvtph_ps(h);
417
418 #else
419
420 #define SSE_CONST4(name, val) static const APPLESEED_SIMD4_ALIGN unsigned int name[4] = { (val), (val), (val), (val) }
421
422 SSE_CONST4(mask_nosign, 0x7fff);
423 SSE_CONST4(magic, (254 - 15) << 23);
424 SSE_CONST4(was_infnan, 0x7bff);
425 SSE_CONST4(exp_infnan, 255 << 23);
426
427 const __m128i mnosign = *(const __m128i*)(&mask_nosign);
428 const __m128i expmant = _mm_and_si128(mnosign, h);
429 const __m128i justsign = _mm_xor_si128(h, expmant);
430 const __m128i expmant2 = expmant; // copy (just here for counting purposes)
431 const __m128i shifted = _mm_slli_epi32(expmant, 13);
432 const __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
433 const __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, *(const __m128i*)(&was_infnan));
434 const __m128i sign = _mm_slli_epi32(justsign, 16);
435 const __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), *(const __m128*)(&exp_infnan));
436 const __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
437 const __m128 final = _mm_or_ps(scaled, sign_inf);
438
439 // ~11 SSE2 ops.
440 return final;
441
442 #undef SSE_CONST4
443
444 #endif
445 }
446
447 #endif // APPLESEED_USE_SSE
448
449 } // namespace foundation
450