1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Floats.hpp"
10 #include "../strings.hpp"
11 
12 #if defined(_MSC_VER)
13 // This only affects defined(_DEBUG) and we could scope this to debug builds
14 // only, but we prefer to keep Debug and Release configs as close as possible,
15 // and formatting/disassembly isn't considered a performance critical path.
16 //
17 // There's a defect in Windows Debug runtime libraries in emitting denorms
18 // when denorm mode is FTZ by the calling library and with the Debug runtime
19 // https://developercommunity.visualstudio.com/content/problem/1187587/printf-assert-failure-unexpected-input-value-log10.html
20 //
21 // To test this place _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
22 // in before calling into this module (and compiled for Debug on MSVC++).
23 #define IGA_NEEDS_DENORM_WORKAROUND
24 #endif
25 
26 // For non-NaN cases this flag enables use of a native AVX instruction
27 // for half float conversion _cvtss_sh.
28 //
29 // #define IGA_USE_FP16C
30 
31 #include <cmath>
32 #include <cstdlib>
33 #include <cstdint>
34 #include <iomanip>
35 #include <iostream>
36 #include <limits>
37 #include <sstream>
38 #if defined(IGA_NEEDS_DENORM_WORKAROUND) || defined(IGA_USE_FP16C)
39 #include <immintrin.h>
40 #endif
41 
42 
43 using namespace iga;
44 
45 // Big Theory Statement (BTS): on NaN's in IGA
46 //
47 // NaN is an equivalence class of floating point values.  Different
48 // libraries, runtimes, and hardware use different elements of the
49 // class as the characteristic value when they want a NaN value.
50 // For instance, the sign bit can be anything and the mantissa payload
51 // bits can be anything except for the leading mantissa bit (which
52 // distinguishes a qnan from an snan).
53 //
54 // NOTE: we distinguish between 'qnan' and 'snan' instead of just using 'nan'
55 //       with the whole payload since we parse as 64b and only narrow once
56 //       we see the type; we considered 'nan' '(' HEXLIT ')' and just letting
57 //       the user sort the meaning of the mantissa out, but this gets messy
58 //       when widening literals during parse or narrowing them during
59 //       formatting.
60 //
61 // Without loss of generality (this generalizes to all IEEE sizes),
62 // the 32-bit IEEE 754 (C.f. section 6.2.1 of the spec) we have
63 //   s eeeeeeee mmm`mmmm`mmmm`mmmm`mmmm`mmmm
64 // (where the s and the m's can be anything such that at least one m is
65 // non-zero; all 0's would imply infinity)
66 //
67 // SNAN: "snan" (signaling NaN) is:
68 //   s 11111111 0xx`xxxx`xxxx`xxxx`xxxx`xxxx
69 //              ^ leading bit of mantissa is 0; at least one bit
70 //                of the rest of the payload (x's must be non-zero)
71 //
72 // QNAN: "qnan" (quiet NaN) is:
73 //   s 11111111 1xx`xxxx`xxxx`xxxx`xxxx`xxxx
//              ^ leading bit of mantissa is 1 and the rest can be anything
75 //                including all zeros since the top mantissa bit is set
76 //                (NaN only needs one mantissa bit set and the quiet bit
77 //                counts)
78 //
79 // IGA supports the following syntax (from examples)
80 //     NaNLit ::= '-'? ('snan'|'qnan') '(' HEXLIT ')'
81 //
82 // Examples:
83 //   * "-snan(0xA):f" parses as
84 //         s eeeeeeee mmm`mmmm`mmmm`mmmm`mmmm`mmmm
85 //         1 11111111 000`0000`0000`0000`0000`1010
86 //         ^ negative ^ snan                  ^^^^ 0xA payload
87 //           sign bit is respected
88 //
89 //   * "qnan(0x1B):f" parses as
90 //         s eeeeeeee mmm`mmmm`mmmm`mmmm`mmmm`mmmm
//         0 11111111 100`0000`0000`0000`0001`1011
92 //         ^ positive ^ qnan                ^^^^^^ 0x1B
93 //
94 //   * "snan(0x0):f" would be a parse error since the mantissa must not be 0
95 //         s eeeeeeee mmm`mmmm`mmmm`mmmm`mmmm`mmmm
//         0 11111111 000`0000`0000`0000`0000`0000
//         ^ positive ^ snan^^^^^^^^^^^^^^^^^^^^^^^ at least one bit must be 1
98 //
99 //   * "snan(0x00800000):f" illegal: payload value too large for bitfield
100 //         s eeeeeeee mmm`mmmm`mmmm`mmmm`mmmm`mmmm
//         0 11111111 000`0000`0000`0000`0000`0000
102 //                  ^ high bit of payload overflows to here
103 //
//   * "snan(0x00800000):df" payload fits since df has 52 mantissa bits
//         s eeeeeeeeeee mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm`mmmm
//         0 11111111111 0000`0000`0000`0000`0000`0000`0000`1000`0000`0000`0000`0000`0000
//                                                          ^ 0x800000
108 //
109 //   * "snan(0x400000):f" is legal, but just means qnan(0).  This is because
110 //      the parser parses the value as 64b and then converts it to 32b when it
111 //      sees the :f.  The payload fits while parsing, but conversion cannot
112 //      tell that we were dealing with qnan
113 
// Field widths of the floating point formats handled in this file, keyed by
// the C++ type that carries the value.  uint16_t stands in for the 16-bit
// half float, which has no native C++ type here.
//
//   FloatMantissaBits<F>() - number of explicitly stored mantissa bits
//   FloatExponentBits<F>() - number of exponent bits
template <typename F> int FloatMantissaBits();
template <typename F> int FloatExponentBits();

// 64-bit (double): 11 exponent bits, 52 mantissa bits
template <> int FloatMantissaBits<double>()
{
    return 52;
}
template <> int FloatExponentBits<double>()
{
    return 11;
}

// 32-bit (float): 8 exponent bits, 23 mantissa bits
template <> int FloatMantissaBits<float>()
{
    return 23;
}
template <> int FloatExponentBits<float>()
{
    return 8;
}

// 16-bit (half, carried as uint16_t): 5 exponent bits, 10 mantissa bits
template <> int FloatMantissaBits<uint16_t>()
{
    return 10;
}
template <> int FloatExponentBits<uint16_t>()
{
    return 5;
}
122 
123 // E.g. FloatBias<float>() == 127
124 //
FloatBias()125 template <typename F> static int FloatBias() {
126     return (1 << (FloatExponentBits<F>() - 1)) - 1;
127 }
128 
129 // FloatBiasDiff<double,float> => 1023 - 127
130 // FloatBiasDiff<float,half>   => 127 - 15 (0x70)
FloatBiasDiff()131 template <typename FBIG,typename FSML> static int FloatBiasDiff() {
132     return FloatBias<FBIG>() - FloatBias<FSML>();
133 }
FloatMantissaBitsDiff()134 template <typename FBIG,typename FSML> static int FloatMantissaBitsDiff() {
135     return FloatMantissaBits<FBIG>() - FloatMantissaBits<FSML>();
136 }
137 
138 template <typename F, typename I> static
FormatFloatImplNaN(std::ostream & os,I bits)139 void FormatFloatImplNaN(std::ostream &os, I bits)
140 {
141     const int MANT_LEN = FloatMantissaBits<F>();
142     const int EXPN_LEN = FloatExponentBits<F>();
143     const I SIGN_BIT = ((I)1 << (MANT_LEN + EXPN_LEN));
144     if (bits & SIGN_BIT) {
145         os << '-';
146     }
147     // split the payload into the quiet/signaling bit
148     // (high bit of the mantissa) and the lower bits
149     const I QNAN_BIT = (I)1 << (MANT_LEN - 1);
150     if (bits & QNAN_BIT) {
151         os << "qnan";
152     } else {
153         os << "snan";
154     }
155     os << "(";
156     I lowerPayload = bits & (QNAN_BIT - 1); // lower bits of mantissa
157     fmtHex(os, (uint64_t)lowerPayload);
158     os << ")";
159 }
160 
// Checks if a floating point number will reparse exactly as formatted.
//
// We parse floats as 64b literals and cast down.  So we use strtod for all
// floating point types and cast down.
//
// Returns true only if strtod consumes all of 'str' (an empty string or a
// string with trailing characters is rejected) and the parsed double,
// narrowed to T, compares equal to x.
template <typename T>
static bool willReparseExactly(T x, const std::string &str)
{
    const char *begin = str.c_str();
    char *end = nullptr;
    // we always parse as a double since the parser will do the same
    const double y = strtod(begin, &end);
    if (end == begin || *end != '\0') {
        // nothing was converted, or only a prefix of the text was; either
        // way the text does not stand for x
        return false;
    }
    return ((T)y == x);
}
172 
173 #ifdef IGA_NEEDS_DENORM_WORKAROUND
// C.f. with the #define for this above for notes on this
//
// Scope guard: the constructor records the current denormals-are-zero mode
// and switches it off; the destructor writes the recorded mode back.
// Copying is disabled so that exactly one object restores the saved mode.
struct ScopedDenormWorkaround {
    unsigned int oldDenormMode; // mode in effect when the guard was created
    ScopedDenormWorkaround()
        : oldDenormMode(_MM_GET_DENORMALS_ZERO_MODE())
    {
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
    }
    ~ScopedDenormWorkaround() {
        _MM_SET_DENORMALS_ZERO_MODE(oldDenormMode);
    }
    ScopedDenormWorkaround(const ScopedDenormWorkaround &) = delete;
    ScopedDenormWorkaround &operator=(const ScopedDenormWorkaround &) = delete;
};
186 #endif
187 
188 // Formats a float in decimal or scientific format if it will not lose
189 // precision during reparse
190 //
191 // Returns false if we were able to format it as something representable.
192 template <typename F, typename I>
TryFormatFloatImplNonHex(std::ostream & os,F x)193 static bool TryFormatFloatImplNonHex(std::ostream &os, F x)
194 {
195 #ifdef IGA_NEEDS_DENORM_WORKAROUND
196     ScopedDenormWorkaround sdw;
197 #endif
198     auto fpc = std::fpclassify(x);
199     switch (fpc)
200     {
201     case FP_INFINITE:
202         if (x < 0) {
203             os << '-';
204         }
205         os << "inf";
206         return true;
207     case FP_NAN:
208         FormatFloatImplNaN<F,I>(os, iga::FloatToBits(x));
209         return true;
210     ///////////////////////////////////////////////////////////////////////////
211     // case FP_ZERO:
212     // case FP_NORMAL:
213     // case FP_SUBNORMAL:
214     default:
215         // Fall through and go through the pretty-print algorithm
216         break;
217     }
218 
219     // we try and render it several different ways until we find something
220     // that we can parse back bit-exact.
221     // We try:
222     //    - decimal
223     //    - exponential
224     //    - and then fall back on hex
225     // static_assert(sizeof(F) == sizeof(I));
226 
227     // try as default, this lets STL pick the format.
228     // it sometimes gives nice terse output
229     // e.g. "3" for "3.0" instead of "3.0000000000..." (when possible)
230     std::stringstream ss;
231     ss.unsetf(std::ios_base::floatfield);
232     ss << x;
233     if (willReparseExactly(x, ss.str())) {
234         auto str = ss.str();
235         os << str;
236         if (str.find('.') == std::string::npos &&
237             str.find('e') == std::string::npos &&
238             str.find('E') == std::string::npos)
239         {
240             // floats need a ".0" suffixing them if not in scientific form
241             // STL default float sometimes drops the .
242             //
243             //  e.g. given "-0.0f", if we were to format that
244             // as "-0:f" (which MSVCRT does), then it parses as
245             // "0" since this is the negation the S64 value 0 (during parse)
246             //
247             // NOTE: we also have to ensure that we aren't using an
248             // exponential (scientific) form already.
249             //
250             // e.g. 1e-007 should not convert to 1e-007.0
251             os << ".0";
252         }
253         return true;
254     }
255 
256     // try as scientific
257     ss.str(std::string()); // reset
258     ss << std::scientific << x;
259     if (willReparseExactly(x, ss.str())) {
260         os << ss.str();
261         return true;
262     }
263 
264     // TODO: IFDEF this given a new enough compiler
265     // (must parse too)
266     // e.g. 0x1.47ae147ae147bp-7 (need to parse first)
267     //   NOTE: parsing should use >>
268     //   float f;
269     //   e.g. std::istringstream(""0x1P-1022") >> std::hexfloat >> f;
270 
271     // fallback to hex integral
272     // FormatFloatImplHex<F,I>(os, x);
273     return false;
274 }
275 
276 template <typename F>
FormatFloatAsHex(std::ostream & os,F f)277 static void FormatFloatAsHex(std::ostream &os, F f)
278 {
279     fmtHex(os, (uint64_t)iga::FloatToBits(f));
280 }
281 
FormatFloat(std::ostream & os,float x)282 void iga::FormatFloat(std::ostream &os, float x)
283 {
284     if (!TryFormatFloatImplNonHex<float,uint32_t>(os, x)) {
285         FormatFloatAsHex<float>(os, x);
286     }
287 }
288 
FormatFloat(std::ostream & os,double x)289 void iga::FormatFloat(std::ostream &os, double x)
290 {
291     if (!TryFormatFloatImplNonHex<double,uint64_t>(os, x)) {
292         FormatFloatAsHex<double>(os, x);
293     }
294 }
295 
FormatFloat(std::ostream & os,uint16_t w16)296 void iga::FormatFloat(std::ostream &os, uint16_t w16)
297 {
298 #if 0
299     // this would turn off all non-hex floats
300     fmtHex(os, (uint64_t)w16);
301 #else
302     // trys to format a half float in a friendly format
303     // falling back to hex if all else fails
304     float f32 = ConvertHalfToFloat(w16);
305     if (IS_NAN(f32)) {
306         // So we get the correct payload size for NaNs
307         FormatFloatImplNaN<uint16_t,uint16_t>(os, w16);
308     } else if (ConvertFloatToHalf(f32) != w16 ||
309         !TryFormatFloatImplNonHex<float,uint32_t>(os, f32))
310     {
311         FormatFloatAsHex<uint16_t>(os, w16);
312     } // else: FormatFloatImplNonHex worked
313 #endif
314 }
315 
316 
FormatFloat(std::ostream & os,uint8_t x)317 void iga::FormatFloat(std::ostream &os, uint8_t x)
318 {
319     FormatFloat(os, ConvertQuarterToFloatGEN(x));
320 }
321 
ConvertDoubleToFloatBits(double f)322 uint32_t iga::ConvertDoubleToFloatBits(double f)
323 {
324     uint64_t f64 = FloatToBits(f);
325 
326     uint64_t m64 = f64 & F64_MANT_MASK;
327     uint64_t e64 = (f64 & F64_EXP_MASK) >> FloatMantissaBits<double>();
328     if (e64 == (F64_EXP_MASK >> FloatMantissaBits<double>()) && m64 != 0) {
329         // f64 NaN
330         uint32_t m32 = (uint32_t)m64 & F32_MANT_MASK;
331         m32 |= (uint32_t)((m64 & F64_QNAN_BIT) >>
332             (FloatMantissaBits<double>() - FloatMantissaBits<float>())); // preserve snan
333         if (m32 == 0) {
334             // The payload was only in the high bits which we dropped;
335             // make it non-zero so we retain NaN'ness
336             m32 = 1;
337         }
338         uint32_t s32 = (uint32_t)(f64 >> 32) & F32_SIGN_BIT;
339         return (s32 | F32_EXP_MASK | m32);
340     } else {
341         // regular conversion can deal with all the other special cases
342         return FloatToBits((float)f);
343     }
344 }
345 
ConvertDoubleToFloat(double f)346 float iga::ConvertDoubleToFloat(double f)
347 {
348     return FloatFromBits(ConvertDoubleToFloatBits(f));
349 }
350 
351 
ConvertFloatToHalf(float f)352 uint16_t iga::ConvertFloatToHalf(float f)
353 {
354     static const uint32_t F32_EXP_MASK =
355         ((1 << FloatExponentBits<float>()) - 1) << FloatMantissaBits<float>();
356 
357     const uint32_t w32 = FloatToBits(f);
358     const uint32_t w32_u = w32 & 0x7FFFFFFF;
359     const uint32_t sign = w32 & 0x80000000;
360     const uint16_t sign16 = (uint16_t)(sign >> 16);
361 
362     if (w32_u > F32_EXP_MASK) { // NaN
363         uint16_t m16 = 0;
364         m16 |= (F32_QNAN_BIT & w32_u) >> // preserve qnan bit
365             (FloatMantissaBits<float>() - FloatMantissaBits<uint16_t>());
366         m16 |= (F16_MANT_MASK >> 1) & w32_u; // and bottom 9b
367         //
368         // s eeeeeeee qmmmmmmmmmmmmmmmmmmmmm
369         //            |            |||||||||
370         //            |            vvvvvvvvv
371         //            +---------->qmmmmmmmmm
372         if (m16 == 0x0) {
373             // if the nonzero payload is in the high bits and and gets
374             // dropped and the signal bit is non-zero, then m16 is 0;
375             // to maintain it as a qnan we must set at least one bit
376             m16 = 0x1;
377         }
378         return sign16 | F16_EXP_MASK | m16;
379     } else if (w32_u == F32_EXP_MASK) { // +/-Infinity
380         return sign16 | F16_EXP_MASK;
381     } else {
382         // norm/denorm
383 #ifdef IGA_USE_FP16C
384         // should be on all 3rd generation or newer CPUs (HSW/SNB)
385         // _MM_FROUND_NO_EXC
386         const auto oldCsr = _mm_getcsr();
387         _mm_setcsr(oldCsr | _MM_FROUND_NO_EXC);
388         uint16_t w16 = (uint16_t)_mm_cvtsi128_si32(
389            _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT));
390         _mm_setcsr(oldCsr);
391         return w16;
392 #else // !IGA_USE_FP16C
393         static const uint32_t F32_BIAS = FloatBias<float>();
394         static const uint32_t F16_BIAS = FloatBias<uint16_t>();
395         static const int F32_MNT_BITS = FloatMantissaBits<float>();
396         static const int F16_MNT_BITS = FloatMantissaBits<uint16_t>();
397 
398         static const uint32_t LOWEST_OVERFLOW = // 0x47800000
399             (F32_BIAS + F16_BIAS + 1) << FloatMantissaBits<float>();
400         static const uint32_t LOWEST_NORM = // 0x38800000
401             (F32_BIAS - F16_BIAS + 1) << FloatMantissaBits<float>();
402         static const uint32_t LOWEST_DENORM = // 0x33000000
403             (F32_BIAS - F16_BIAS - (uint16_t)F16_MNT_BITS) << FloatMantissaBits<float>();
404         auto round = [](uint32_t v, uint32_t g, uint32_t s) {
405             return v + (g & (s | v));
406         };
407         if (w32_u >= LOWEST_OVERFLOW) { // overflows to infinity
408             return sign16 | F16_EXP_MASK;
409         } else if (w32_u >= LOWEST_NORM) { // fits as normalized half
410             uint32_t v =
411                 (((w32_u >> F32_MNT_BITS) - (F32_BIAS - F16_BIAS)) << F16_MNT_BITS) |
412                     ((w32_u >> (F32_MNT_BITS - F16_MNT_BITS)) & F16_MANT_MASK);
413             uint32_t g = (w32_u >> (F32_MNT_BITS - F16_MNT_BITS - 1)) & 0x1;
414             uint32_t s = (w32_u & 0x0FFF) ? 1 : 0;
415             return (uint16_t)round(sign16 | v, g, s);
416         } else if (w32_u >= LOWEST_DENORM) { // fits as normalized half
417             uint32_t i = (F32_BIAS - 1 - 1) - (w32_u >> F32_MNT_BITS);
418             uint32_t w32_u2 = (w32_u & F32_MANT_MASK) | (F32_MANT_MASK + 1);
419             uint32_t v = sign16 | (w32_u2 >> (i + 1));
420             uint32_t g = (w32_u2 >> i) & 1;
421             uint32_t s = (w32_u2 & ((1 << i) - 1)) ? 1 : 0;
422             return (uint16_t)round(v, g, s);
423         } else {
424             // underflow to +-0
425             return sign16;
426         }
427 #endif // !IGA_USE_FP16C
428     }
429 }
430 
431 
432 
433 // GEN's 8-bit restricted float ("quarter float")
434 //   s eee mmmm (bias 2^(3-1) - 1 == 3)
435 //  s gfe dcba
436 // mantissa shifted to the top of the float
437 //     dcb`a000`0000`0000`0000`0000
// exponent is unpacked by expanding
//    gfe to the following:
//
//    gGGGGGfe where G = ~g
442 // negative and positive 0 bypass this logic
443 //
444 // => NaN and infinities are illegal in this format
445 // => Denorms are not supported
446 //     QUOTE:
447 //      Specifically, when the exponent field is zero and the fraction
448 //      field is not zero, an implied one is still present instead of
449 //      taking a denormalized form (without an implied one).  This results
450 //      in a simple implementation but with a smaller dynamic range -
451 //      the magnitude of the smallest non-zero number is 0.1328125.
ConvertQuarterToFloatGEN(uint8_t u8)452 float iga::ConvertQuarterToFloatGEN(uint8_t u8)
453 {
454     if (u8 == 0x00) {
455         return 0.0f;
456     } else if (u8 == 0x80) {
457         return -0.0f;
458     } else {
459         uint32_t f32;
460         f32 = ((uint32_t)u8 & 0x80) << (32 - 8); // d: sign
461         f32 |= ((0x30 & (uint32_t)u8) << (23 - 4)); // fe: low bits of exp
462         if ((0x40 & u8) == 0) { // g = 0
463             // exp=011111fe
464             f32 |= 0x1F << (23 + 2);
465         } else {
466             // exp=100000fe
467             f32 |= 1ul << (23 + 7); // f32 high exp bit
468         }
469         f32 |= ((uint32_t)u8 & 0xF) << (23 - 4); // dcba: mantissa
470         return FloatFromBits(f32);
471     }
472 }
473 
IsNaN(uint16_t u16)474 bool iga::IsNaN(uint16_t u16)
475 {
476     return
477         (F16_EXP_MASK & u16) == F16_EXP_MASK &&
478         (F16_MANT_MASK & u16) != 0;
479 }
480 
IsInf(uint16_t u16)481 bool iga::IsInf(uint16_t u16)
482 {
483     return
484         (F16_EXP_MASK & u16) == F16_EXP_MASK &&
485         (F16_MANT_MASK & u16) == 0;
486 }
487 
488 
ConvertHalfToFloat(uint16_t u16)489 float iga::ConvertHalfToFloat(uint16_t u16)
490 {
491     uint16_t u16_u = u16 & 0x7FFF;
492     uint32_t s32 = ((uint32_t)u16 & F16_SIGN_BIT) << 16;
493     uint32_t m16 = u16 & F16_MANT_MASK;
494     if (u16_u > F16_EXP_MASK) {
495         // NaN
496         uint32_t m32 = (u16 & F16_QNAN_BIT) <<
497             FloatMantissaBitsDiff<float,uint16_t>(); // preserve sNaN bit
498         m32 |= (F16_MANT_MASK >> 1) & m16;
499         if (m32 == 0) {
500             m32 = 1; // ensure still NaN
501         }
502         return FloatFromBits(s32 | F32_EXP_MASK | m32);
503     }
504 #ifdef IGA_USE_FP16C
505     float f = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(u16)));
506     return f;
507 #else // !IGA_USE_FP16C
508     uint32_t e16 = (u16 & F16_EXP_MASK) >> FloatMantissaBits<uint16_t>();
509     uint32_t e32, m32;
510     if (u16_u == F16_EXP_MASK) {
511         // +-infinity
512         e32 = F32_EXP_MASK >> FloatMantissaBits<float>();
513         m32 = 0;
514     } else if (e16 != 0 && e16 < 0x1F) {
515         //  normal number
516         e32 = e16 + FloatBiasDiff<float,uint16_t>(); // (127 - 15); // 0x70
517         m32 = m16 << FloatMantissaBitsDiff<float,uint16_t>(); // (23 - 10);
518     } else if (e16 == 0 && m16 != 0) {
519         // denorm/subnorm number (e16 == 0) => renormalize it
520         // shift the mantissa left until the hidden one gets set
521         for (e32 = FloatBiasDiff<float,uint16_t>() + 1;
522             (m16 & (F16_MANT_MASK + 1)) == 0;
523             m16 <<= 1, e32--)
524             ;
525         m32 = (m16 << FloatMantissaBitsDiff<float,uint16_t>()) & F32_MANT_MASK;
526     } else { // if (e16 == 0) // +/- 0.0
527         e32 = 0;
528         m32 = 0;
529     }
530     return FloatFromBits(s32 | (e32 << FloatMantissaBits<float>()) | m32);
531 #endif // !IGA_USE_FP16C
532 }
533 
ConvertFloatToDouble(float f)534 double iga::ConvertFloatToDouble(float f)
535 {
536     if (IS_NAN(f)) {
537         uint32_t f32 = FloatToBits(f);
538 
539         uint64_t m64;
540         m64 = (uint64_t)(f32 & F32_QNAN_BIT) <<
541             (FloatMantissaBits<double>() - FloatMantissaBits<float>()); // keep the sNaN bit
542         m64 |= (F32_MANT_MASK >> 1) & f32; // keep the non sNaN part
543                                                // lower part of the payload
544         uint64_t bits =
545             (((uint64_t)f32 & F32_SIGN_BIT) << 32) | // sign
546             F64_EXP_MASK |                           // exp
547             m64;                                         // new mantissa
548         return FloatFromBits(bits);
549     } else {
550         // not NaN: use default value
551         return (double)f;
552     }
553 }
554 
555 
556 
// Parses 'syntax' as a floating point literal using std::strtod.
//
// 'value' always receives whatever strtod returned.  The result is true
// only if strtod converted something and consumed the whole string; an
// empty string, text that is not a number, and text with trailing
// characters all yield false.
bool iga::ParseFLTLIT(const std::string &syntax, double &value)
{
    const char *begin = syntax.c_str();
    char *end = nullptr;
    value = std::strtod(begin, &end);
    if (end == begin) {
        // no conversion was performed (e.g. the string is empty); strtod
        // returns 0 in this case, which must not be mistaken for a literal
        return false;
    }
    // succeed only if the conversion stopped at the very end of the string
    return end == begin + syntax.size();
}
566