1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #ifndef IGA_FLOATS_HPP
10 #define IGA_FLOATS_HPP
11 
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
16 
17 // Provides utilities for dealing with floating point numbers including some
18 // minimal fp16 support.
19 
// IS_NAN(X) / IS_INF(X): classify the floating-point expression X.
//
// Every toolchain except MSVC older than VS2013 (_MSC_VER < 1800) provides
// std::isnan / std::isinf in <cmath>.  The check keys on _MSC_VER rather than
// _WIN32 so that GCC/Clang builds targeting Windows (where _MSC_VER is not
// defined) also use the standard functions instead of the fallback below.
#if !defined(_MSC_VER) || (_MSC_VER >= 1800)
#define IS_NAN(X) std::isnan(X)
#define IS_INF(X) std::isinf(X)
#else
// Fallback for pre-VS2013 MSVC:
//  - a NaN is the only value that compares unequal to itself;
//  - (inf - inf) is NaN whereas (finite - finite) is not.
// NOTE: these forms evaluate X more than once, so X must be free of
// side effects.
#define IS_NAN(X) ((X) != (X))
#define IS_INF(X) (!IS_NAN(X) && IS_NAN((X) - (X)))
#endif
28 
29 namespace iga {
30 
31 // formats a floating point value in decimal if possible
32 // otherwise it falls back to hex
33 void FormatFloat(std::ostream &os, double d);
34 void FormatFloat(std::ostream &os, float f);
35 void FormatFloat(std::ostream &os, uint16_t h);
36 void FormatFloat(std::ostream &os, uint8_t q); // GEN's 8-bit restricted float
37 
// These functions exist because ordinary operations on NaN values might change
// the NaN payload; e.g. an sNaN might be converted to a qNaN during a cast.
40 float     ConvertDoubleToFloat(double d);
41 uint32_t  ConvertDoubleToFloatBits(double d);
42 uint16_t  ConvertFloatToHalf(float f);
43 static inline
ConvertDoubleToHalf(double d)44 uint16_t  ConvertDoubleToHalf(double d) {
45     return ConvertFloatToHalf(ConvertDoubleToFloat(d));
46 }
47 float     ConvertHalfToFloat(uint16_t u16);
48 double    ConvertFloatToDouble(float f32);
49 
50 // This expands Intel GEN's restricted 8-bit format
51 float     ConvertQuarterToFloatGEN(uint8_t u8);
52 
53 
54 // Various raw accessors to convert between bits and float
// 16-bit overload: the argument is returned unchanged, since a half value
// is already passed around as its raw uint16_t encoding.
// (Presumably provided so the same name can be used for every width.)
static inline uint16_t FloatToBits(uint16_t f)
{
    return f;
}
// Returns the raw 64-bit object representation of the double `f`.
//
// The bytes are copied with std::memcpy instead of being read back through
// a union: reading a union member other than the one last written is
// undefined behavior in C++, whereas memcpy between equally sized trivially
// copyable objects is well defined (and compiles to a plain register move).
static inline uint64_t FloatToBits(double f) {
    static_assert(sizeof(uint64_t) == sizeof(double),
        "FloatToBits(double) requires a 64-bit double");
    uint64_t bits = 0;
    std::memcpy(&bits, &f, sizeof(bits));
    return bits;
}
// Returns the raw 32-bit object representation of the float `f`.
//
// The bytes are copied with std::memcpy instead of being read back through
// a union: reading a union member other than the one last written is
// undefined behavior in C++, whereas memcpy between equally sized trivially
// copyable objects is well defined (and compiles to a plain register move).
static inline uint32_t FloatToBits(float f) {
    static_assert(sizeof(uint32_t) == sizeof(float),
        "FloatToBits(float) requires a 32-bit float");
    uint32_t bits = 0;
    std::memcpy(&bits, &f, sizeof(bits));
    return bits;
}
66 
// 16-bit overload: the bit pattern is returned unchanged, since a half value
// is passed around as its raw uint16_t encoding rather than as a distinct
// floating-point type.
static inline uint16_t FloatFromBits(uint16_t f)
{
    return f;
}
// Reinterprets the 32-bit pattern `f` as a float (no numeric conversion).
//
// The bytes are copied with std::memcpy instead of being read back through
// a union: reading a union member other than the one last written is
// undefined behavior in C++, whereas memcpy between equally sized trivially
// copyable objects is well defined (and compiles to a plain register move).
static inline float FloatFromBits(uint32_t f) {
    static_assert(sizeof(float) == sizeof(uint32_t),
        "FloatFromBits(uint32_t) requires a 32-bit float");
    float value = 0.0f;
    std::memcpy(&value, &f, sizeof(value));
    return value;
}
// Reinterprets the 64-bit pattern `f` as a double (no numeric conversion).
//
// The bytes are copied with std::memcpy instead of being read back through
// a union: reading a union member other than the one last written is
// undefined behavior in C++, whereas memcpy between equally sized trivially
// copyable objects is well defined (and compiles to a plain register move).
static inline double FloatFromBits(uint64_t f) {
    static_assert(sizeof(double) == sizeof(uint64_t),
        "FloatFromBits(uint64_t) requires a 64-bit double");
    double value = 0.0;
    std::memcpy(&value, &f, sizeof(value));
    return value;
}
78 
79 bool IsNaN(uint16_t u16);
80 bool IsInf(uint16_t u16);
81 
82 
// Bit-field masks for the three floating-point encodings handled here.
// Each group gives the sign bit, the exponent field, the mantissa field and
// the top mantissa bit (named *_QNAN_BIT).  The masks are written in terms
// of field position and width; the resulting value is noted alongside.

// 64-bit: sign at bit 63, 11 exponent bits at [62:52], 52 mantissa bits
static const uint64_t F64_SIGN_BIT  = 1ull << 63;          // 0x8000000000000000
static const uint64_t F64_EXP_MASK  = 0x7FFull << 52;      // 0x7FF0000000000000
static const uint64_t F64_MANT_MASK = (1ull << 52) - 1;    // 0x000FFFFFFFFFFFFF
static const uint64_t F64_QNAN_BIT  = 1ull << 51;          // 0x0008000000000000

// 32-bit: sign at bit 31, 8 exponent bits at [30:23], 23 mantissa bits
static const uint32_t F32_SIGN_BIT  = 1u << 31;            // 0x80000000
static const uint32_t F32_EXP_MASK  = 0xFFu << 23;         // 0x7F800000
static const uint32_t F32_MANT_MASK = (1u << 23) - 1;      // 0x007FFFFF
static const uint32_t F32_QNAN_BIT  = 1u << 22;            // 0x00400000

// 16-bit: sign at bit 15, 5 exponent bits at [14:10], 10 mantissa bits
static const uint16_t F16_SIGN_BIT  = 1u << 15;            // 0x8000
static const uint16_t F16_EXP_MASK  = 0x1Fu << 10;         // 0x7C00
static const uint16_t F16_MANT_MASK = (1u << 10) - 1;      // 0x03FF
static const uint16_t F16_QNAN_BIT  = 1u << 9;             // 0x0200
95 
// Parses the lexical FLTLIT pattern into a double
97 bool ParseFLTLIT(const std::string &string, double &d);
98 
99 } // namespace iga
100 
101 #endif // IGA_FLOATS_HPP
102