1 /*
2  * This implementation is extracted from Eigen:
3  *   Repo: bitbucket.org/eigen/eigen
4  *   File: Eigen/src/Core/arch/CUDA/Half.h
5  *   Commit ID: 96e0f73a35de54f675d825bef5339b2f08e77eb4
6  *
7  * Removed a lot of redundant and cuda-specific code.
8  */
9 
10 #define EIGEN_STRONG_INLINE static inline
11 #define EIGEN_DEVICE_FUNC
12 
13 // This file is part of Eigen, a lightweight C++ template library
14 // for linear algebra.
15 //
16 // This Source Code Form is subject to the terms of the Mozilla
17 // Public License v. 2.0. If a copy of the MPL was not distributed
18 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
19 //
20 // The conversion routines are Copyright (c) Fabian Giesen, 2016.
21 // The original license follows:
22 //
23 // Copyright (c) Fabian Giesen, 2016
24 // All rights reserved.
25 // Redistribution and use in source and binary forms, with or without
26 // modification, are permitted.
27 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 // “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 
39 
40 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
41 // type Eigen::half (inheriting from CUDA's __half struct) with
42 // operator overloads such that it behaves basically as an arithmetic
43 // type. It will be quite slow on CPUs (so it is recommended to stay
44 // in fp32 for CPUs, except for simple parameter conversions, I/O
45 // to disk and the likes), but fast on GPUs.
46 
47 
48 #ifndef EIGEN_HALF_CUDA_H
49 #define EIGEN_HALF_CUDA_H
50 
51 namespace Eigen {
52 
53 namespace half_impl {
54 
55 // Make our own __half definition that is similar to CUDA's.
56 struct __half {
__half__half57   EIGEN_DEVICE_FUNC __half() : x(0) {}
__half__half58   explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
59   unsigned short x;
60 };
61 
62 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
63 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
64 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
65 
66 // Conversion routines, including fallbacks for the host or older CUDA.
67 // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
68 // these in hardware. If we need more performance on older/other CPUs, they are
69 // also possible to vectorize directly.
70 
raw_uint16_to_half(unsigned short x)71 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
72   __half h;
73   h.x = x;
74   return h;
75 }
76 
77 union FP32 {
78   unsigned int u;
79   float f;
80 };
81 
float_to_half_rtne(float ff)82 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
83 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
84   return __float2half(ff);
85 
86 #elif defined(EIGEN_HAS_FP16_C)
87   __half h;
88   h.x = _cvtss_sh(ff, 0);
89   return h;
90 
91 #else
92   FP32 f; f.f = ff;
93 
94   const FP32 f32infty = { 255 << 23 };
95   const FP32 f16max = { (127 + 16) << 23 };
96   const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
97   unsigned int sign_mask = 0x80000000u;
98   __half o;
99   o.x = static_cast<unsigned short>(0x0u);
100 
101   unsigned int sign = f.u & sign_mask;
102   f.u ^= sign;
103 
104   // NOTE all the integer compares in this function can be safely
105   // compiled into signed compares since all operands are below
106   // 0x80000000. Important if you want fast straight SSE2 code
107   // (since there's no unsigned PCMPGTD).
108 
109   if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
110     o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
111   } else {  // (De)normalized number or zero
112     if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
113       // use a magic value to align our 10 mantissa bits at the bottom of
114       // the float. as long as FP addition is round-to-nearest-even this
115       // just works.
116       f.f += denorm_magic.f;
117 
118       // and one integer subtract of the bias later, we have our final float!
119       o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
120     } else {
121       unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
122 
123       // update exponent, rounding bias part 1
124       f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
125       // rounding bias part 2
126       f.u += mant_odd;
127       // take the bits!
128       o.x = static_cast<unsigned short>(f.u >> 13);
129     }
130   }
131 
132   o.x |= static_cast<unsigned short>(sign >> 16);
133   return o;
134 #endif
135 }
136 
half_to_float(__half h)137 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
138 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
139   return __half2float(h);
140 
141 #elif defined(EIGEN_HAS_FP16_C)
142   return _cvtsh_ss(h.x);
143 
144 #else
145   const FP32 magic = { 113 << 23 };
146   const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
147   FP32 o;
148 
149   o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
150   unsigned int exp = shifted_exp & o.u;   // just the exponent
151   o.u += (127 - 15) << 23;                // exponent adjust
152 
153   // handle exponent special cases
154   if (exp == shifted_exp) {     // Inf/NaN?
155     o.u += (128 - 16) << 23;    // extra exp adjust
156   } else if (exp == 0) {        // Zero/Denormal?
157     o.u += 1 << 23;             // extra exp adjust
158     o.f -= magic.f;             // renormalize
159   }
160 
161   o.u |= (h.x & 0x8000) << 16;    // sign bit
162   return o.f;
163 #endif
164 }
165 
166 } // end namespace half_impl
167 
168 } // end namespace Eigen
169 
170 #endif // EIGEN_HALF_CUDA_H
171