/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#ifndef IGA_FLOATS_HPP
#define IGA_FLOATS_HPP

#include <cstdint>
#include <cmath>
#include <cstring>
#include <string>
#include <iostream>

// Provides utilities for dealing with floating point numbers including some
// minimal fp16 support.

#if !defined(_WIN32) || (_MSC_VER >= 1800)
// GCC and VS2013 and higher support these
#define IS_NAN(X) std::isnan(X)
#define IS_INF(X) std::isinf(X)
#else
// Fallbacks for older toolchains:
//  - a NaN is the only value that compares unequal to itself
//  - (inf - inf) is NaN, whereas (x - x) is 0 for every finite x
#define IS_NAN(X) ((X) != (X))
#define IS_INF(X) (!IS_NAN(X) && IS_NAN((X) - (X)))
#endif

namespace iga {

// Formats a floating point value in decimal if possible;
// otherwise it falls back to hex.
// The uint16_t and uint8_t overloads take the raw bits of the value.
void FormatFloat(std::ostream &os, double d);
void FormatFloat(std::ostream &os, float f);
void FormatFloat(std::ostream &os, uint16_t h);
void FormatFloat(std::ostream &os, uint8_t q); // GEN's 8-bit restricted float

// These functions exist since operations on NaN values might change the NaN
// payload.  E.g. an sNaN might convert to a qNaN during a cast.
float    ConvertDoubleToFloat(double d);
uint32_t ConvertDoubleToFloatBits(double d);
uint16_t ConvertFloatToHalf(float f);
// Narrows a double to half-float bits in two steps: double -> float -> half.
static inline uint16_t ConvertDoubleToHalf(double d) {
    return ConvertFloatToHalf(ConvertDoubleToFloat(d));
}
float    ConvertHalfToFloat(uint16_t u16);
double   ConvertFloatToDouble(float f32);

// This expands Intel GEN's restricted 8-bit format
float ConvertQuarterToFloatGEN(uint8_t u8);


// Various raw accessors to convert between bits and float.
//
// The bit copies use std::memcpy rather than reading an inactive union
// member: the latter is undefined behavior in C++, while memcpy between
// equally sized trivially copyable objects is well defined.

// Half floats are already carried around as raw bits; identity.
static inline uint16_t FloatToBits(uint16_t f) {return f;}
// Returns the 64-bit object representation of a double.
static inline uint64_t FloatToBits(double f) {
    static_assert(sizeof(uint64_t) == sizeof(double),
        "double must be 64 bits wide");
    uint64_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return bits;
}
// Returns the 32-bit object representation of a float.
static inline uint32_t FloatToBits(float f) {
    static_assert(sizeof(uint32_t) == sizeof(float),
        "float must be 32 bits wide");
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return bits;
}

// Half floats have no native type here; the bits are returned unchanged.
static inline uint16_t FloatFromBits(uint16_t f) {return f;}
// Reinterprets 32 bits as a float.
static inline float FloatFromBits(uint32_t f) {
    static_assert(sizeof(float) == sizeof(uint32_t),
        "float must be 32 bits wide");
    float value;
    std::memcpy(&value, &f, sizeof(value));
    return value;
}
// Reinterprets 64 bits as a double.
static inline double FloatFromBits(uint64_t f) {
    static_assert(sizeof(double) == sizeof(uint64_t),
        "double must be 64 bits wide");
    double value;
    std::memcpy(&value, &f, sizeof(value));
    return value;
}

// NaN / infinity predicates on a 16-bit value
// (presumably raw fp16 bits; the definitions live outside this header).
bool IsNaN(uint16_t u16);
bool IsInf(uint16_t u16);


// Field masks for 64-, 32- and 16-bit floating point bit patterns:
// sign bit, exponent field, mantissa field, and the top mantissa bit
// (named the "qNaN bit" here).
static const uint64_t F64_SIGN_BIT  = 0x8000000000000000ull;
static const uint64_t F64_EXP_MASK  = 0x7FF0000000000000ull;
static const uint64_t F64_MANT_MASK = 0x000FFFFFFFFFFFFFull;
static const uint64_t F64_QNAN_BIT  = 0x0008000000000000ull;
static const uint32_t F32_SIGN_BIT  = 0x80000000;
static const uint32_t F32_EXP_MASK  = 0x7F800000;
static const uint32_t F32_MANT_MASK = 0x007FFFFF;
static const uint32_t F32_QNAN_BIT  = 0x00400000;
static const uint16_t F16_SIGN_BIT  = 0x8000;
static const uint16_t F16_EXP_MASK  = 0x7C00;
static const uint16_t F16_MANT_MASK = 0x03FF;
static const uint16_t F16_QNAN_BIT  = 0x0200;

// Parses the lexical FLTLIT pattern into a double
bool ParseFLTLIT(const std::string &string, double &d);

} // namespace iga

#endif // IGA_FLOATS_HPP