/*
 * This implementation is extracted from Eigen:
 * Repo: bitbucket.org/eigen/eigen
 * File: Eigen/src/Core/arch/CUDA/Half.h
 * Commit ID: 96e0f73a35de54f675d825bef5339b2f08e77eb4
 *
 * Removed a lot of redundant and CUDA-specific code.
 */

#define EIGEN_STRONG_INLINE static inline
#define EIGEN_DEVICE_FUNC

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the like), but fast on GPUs.


#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H

namespace Eigen {

namespace half_impl {

// Make our own __half definition that is similar to CUDA's.
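// The x member below stores the raw IEEE 754 binary16 bit pattern
// (1 sign bit, 5 exponent bits, 10 mantissa bits).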
struct __half {
  EIGEN_DEVICE_FUNC __half() : x(0) {}
  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
  unsigned short x;
};

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);

// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older or other CPUs,
// they could also be vectorized directly.

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
  __half h;
  h.x = x;
  return h;
}

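// FP32 is used by the fallback paths below to reinterpret a float's bit
// pattern as an unsigned int (and back), so the sign, exponent, and mantissa
// fields can be manipulated with plain integer arithmetic.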
union FP32 {
  unsigned int u;
  float f;
};

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __float2half(ff);

#elif defined(EIGEN_HAS_FP16_C)
  __half h;
  h.x = _cvtss_sh(ff, 0);
  return h;

#else
  FP32 f; f.f = ff;

  const FP32 f32infty = { 255 << 23 };
  const FP32 f16max = { (127 + 16) << 23 };
  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
  __half o;
  o.x = static_cast<unsigned short>(0x0u);

  unsigned int sign = f.u & sign_mask;
  f.u ^= sign;

  // NOTE all the integer compares in this function can be safely
  // compiled into signed compares since all operands are below
  // 0x80000000. Important if you want fast straight SSE2 code
  // (since there's no unsigned PCMPGTD).

  if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
  } else { // (De)normalized number or zero
    if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
      // use a magic value to align our 10 mantissa bits at the bottom of
      // the float. as long as FP addition is round-to-nearest-even this
      // just works.
      f.f += denorm_magic.f;

      // and one integer subtract of the bias later, we have our final float!
      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
    } else {
      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd

      // update exponent, rounding bias part 1
      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
      // rounding bias part 2
      f.u += mant_odd;
      // take the bits!
      o.x = static_cast<unsigned short>(f.u >> 13);
    }
  }

  o.x |= static_cast<unsigned short>(sign >> 16);
  return o;
#endif
}
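
// Worked example for the fallback path above (informal check): ff = 1.0f gives
// f.u = 0x3f800000, which takes the normalized branch; after the rounding bias
// f.u = 0x07800fff, so o.x = f.u >> 13 = 0x3c00, the binary16 encoding of 1.0.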

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __half2float(h);

#elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);

#else
  const FP32 magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
  FP32 o;

  o.u = (h.x & 0x7fff) << 13;            // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u;  // just the exponent
  o.u += (127 - 15) << 23;               // exponent adjust

  // handle exponent special cases
  if (exp == shifted_exp) {    // Inf/NaN?
    o.u += (128 - 16) << 23;   // extra exp adjust
  } else if (exp == 0) {       // Zero/Denormal?
    o.u += 1 << 23;            // extra exp adjust
    o.f -= magic.f;            // renormalize
  }

  o.u |= (h.x & 0x8000) << 16; // sign bit
  return o.f;
#endif
}
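
// Illustrative round-trip sketch (the helper name here is purely for
// illustration): shows how the routines above are used together. Values that
// are exactly representable in binary16 (e.g. 0.5f, encoded as 0x3800) survive
// the round trip unchanged; other values are rounded to nearest-even.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_round_trip_example(float value) {
  __half h = float_to_half_rtne(value);  // encode: float -> 16-bit pattern
  return half_to_float(h);               // decode: 16-bit pattern -> float
}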

} // end namespace half_impl

} // end namespace Eigen

#endif // EIGEN_HALF_CUDA_H
