1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #pragma once
10 
11 #include <limits.h>
12 #include <cmath>
13 
14 namespace iSTD
15 {
16 /*****************************************************************************\
17 Constants:
18     FPU_FLOAT32_*
19 
20 Description:
21     Binary representation of 32-bit floating point specials.
22     FPU_FLOAT32_COMPUTE special value can be used in result tables to mark
23     cases, where final value should be computed normally.
24 \*****************************************************************************/
25 const DWORD FPU_FLOAT32_NAN         = 0x7FFFFFFF;
26 const DWORD FPU_FLOAT32_NEG_INF     = 0xFF800000;
27 const DWORD FPU_FLOAT32_POS_INF     = 0x7F800000;
28 const DWORD FPU_FLOAT32_NEG_ZERO    = 0x80000000;
29 const DWORD FPU_FLOAT32_POS_ZERO    = 0x00000000;
30 const DWORD FPU_FLOAT32_COMPUTE     = 0xFFFFFFFF;
31 const DWORD FPU_FLOAT32_ONE         = (DWORD) 0x3F800000;
32 const DWORD FPU_FLOAT32_MINUS_ONE   = (DWORD) 0xBF800000;
33 
34 
35 /*****************************************************************************\
36 Enumeration:
37     FPU_FLOAT_CLASS
38 
39 Description:
40     Classes of floating point numbers.
41     (+0, -0, +finite, -finite, +Inf, -Inf, NaN, -denorm, +denorm)
42 \*****************************************************************************/
enum FPU_FLOAT_CLASS
{
    // Enumerators run from the most negative class to the most positive one,
    // with NaN last. The numeric values are used directly as row / column
    // indices of the RESULT lookup tables in the Float32Safe* functions, so
    // they must stay dense and start at zero.
    FPU_FLOAT_CLASS_NEG_INF    = 0,   // -Inf
    FPU_FLOAT_CLASS_NEG_FINITE = 1,   // negative normal number
    FPU_FLOAT_CLASS_NEG_DENORM = 2,   // negative denormal
    FPU_FLOAT_CLASS_NEG_ZERO   = 3,   // -0.0
    FPU_FLOAT_CLASS_POS_ZERO   = 4,   // +0.0
    FPU_FLOAT_CLASS_POS_DENORM = 5,   // positive denormal
    FPU_FLOAT_CLASS_POS_FINITE = 6,   // positive normal number
    FPU_FLOAT_CLASS_POS_INF    = 7,   // +Inf
    FPU_FLOAT_CLASS_NAN        = 8,   // any NaN, sign ignored

    // Number of classes above; used as the dimension of the lookup tables.
    NUM_FPU_FLOAT_CLASSES      = 9
};
55 
56 /*****************************************************************************\
57 Inline Function:
58     Float32GetClass
59 
60 Description:
61     Returns class (+/-0, +/-denorm, +/-finite, +/-Inf, NaN) of 32-bit float.
62 \*****************************************************************************/
Float32GetClass(const float f)63 inline FPU_FLOAT_CLASS Float32GetClass( const float f )
64 {
65     FLOAT32 f32;
66     f32.value.f = f;
67 
68     switch( f32.value.u )
69     {
70     case FPU_FLOAT32_POS_ZERO:  return FPU_FLOAT_CLASS_POS_ZERO;
71     case FPU_FLOAT32_NEG_ZERO:  return FPU_FLOAT_CLASS_NEG_ZERO;
72     case FPU_FLOAT32_POS_INF:   return FPU_FLOAT_CLASS_POS_INF;
73     case FPU_FLOAT32_NEG_INF:   return FPU_FLOAT_CLASS_NEG_INF;
74     default:                    break;
75     }
76 
77     if( f32.exponent == 0xFF )
78     {
79         return FPU_FLOAT_CLASS_NAN;
80     }
81     else if( f32.exponent == 0x00 )
82     {
83         if( f32.sign == 0 )
84         {
85             return FPU_FLOAT_CLASS_POS_DENORM;
86         }
87         else
88         {
89             return FPU_FLOAT_CLASS_NEG_DENORM;
90         }
91     }
92 
93     if( f32.sign )
94     {
95         return FPU_FLOAT_CLASS_NEG_FINITE;
96     }
97 
98     return FPU_FLOAT_CLASS_POS_FINITE;
99 }
100 
101 /*****************************************************************************\
102 Inline Function:
103     Float32IsInfinity
104 
105 Description:
106     Returns true if the 32-bit float is +Inf or -Inf.
107 \*****************************************************************************/
Float32IsInfinity(const float f)108 inline bool Float32IsInfinity( const float f )
109 {
110     FPU_FLOAT_CLASS fClass = Float32GetClass( f );
111 
112     return ( fClass == FPU_FLOAT_CLASS_POS_INF ) ||
113            ( fClass == FPU_FLOAT_CLASS_NEG_INF );
114 }
115 
116 /*****************************************************************************\
117 Inline Function:
118     Float32IsDenorm
119 
120 Description:
121     Returns true if class is +Denorm or -Denorm.
122 \*****************************************************************************/
Float32IsDenorm(const float f)123 inline bool Float32IsDenorm( const float f )
124 {
125     FPU_FLOAT_CLASS fClass = Float32GetClass( f );
126 
127     return ( fClass == FPU_FLOAT_CLASS_NEG_DENORM ) ||
128            ( fClass == FPU_FLOAT_CLASS_POS_DENORM );
129 }
130 
131 /*****************************************************************************\
132 
133 Inline Function:
134     Float32IsFinite
135 
136 Description:
137     Returns true if f is finite: not +/-INF, and not NaN.
138 \*****************************************************************************/
Float32IsFinite(const float f)139 inline bool Float32IsFinite( const float f )
140 {
141     FPU_FLOAT_CLASS fClass = Float32GetClass( f );
142 
143     return ( fClass != FPU_FLOAT_CLASS_NAN )     &&
144            ( fClass != FPU_FLOAT_CLASS_NEG_INF ) &&
145            ( fClass != FPU_FLOAT_CLASS_POS_INF );
146 }
147 
148 /*****************************************************************************\
149 Inline Function:
150     IsFPZero
151 
152 Description:
153     Returns true if the argument x seen as a 32-bit IEEE754 floating point
154     number is either positive or negative zero  +0.0, -0.0.
155 
156 Input:
157     dword value that will be interpreted as a binary32 representation
158     of single-precision floating point value.
159 
160 Output:
161     True if the value represents either positive or negative float zero.
162 
163 \*****************************************************************************/
IsFPZero(const DWORD x)164 inline bool IsFPZero( const DWORD x )
165 {
166     return ( x == iSTD::FPU_FLOAT32_POS_ZERO ) ||
167            ( x == iSTD::FPU_FLOAT32_NEG_ZERO );
168 }
169 
170 /*****************************************************************************\
171 Inline Function:
172     Float32SafeAdd
173 
174 Description:
175     Performs addition taking care of floating point specials in software.
176 \*****************************************************************************/
Float32SafeAdd(const float arg1,const float arg2,const bool denormRetain)177 inline float Float32SafeAdd( const float arg1, const float arg2, const bool denormRetain  )
178 {
179     // Table for handling IEEE 754 specials in addition
180     //
181     //  a + b       -Inf    -X      -0      +0      +X      +Inf    NaN
182     //
183     //  -Inf        -Inf    -Inf    -Inf    -Inf    -Inf    NaN     NaN
184     //  -X          -Inf    <add>   <add>   <add>   <add>   +Inf    NaN
185     //  -0          -Inf    <add>   -0      +0      <add>   +Inf    NaN
186     //  +0          -Inf    <add>   +0      +0      <add>   +Inf    NaN
187     //  +X          -Inf    <add>   <add>   <add>   <add>   +Inf    NaN
188     //  +Inf        NaN     +Inf    +Inf    +Inf    +Inf    +Inf    NaN
189     //  NaN         NaN     NaN     NaN     NaN     NaN     NaN     NaN
190     //
191 
192     static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
193     //    -Inf                  -X                    -denorm               -0                    +0                    +denorm               +X                    +Inf                  NaN
194         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF  , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // -Inf
195         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE  , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // -X
196         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // -denorm
197         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // -0
198         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // +0
199         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // +denorm
200         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE  , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // +X
201         { FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF  , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // +Inf
202         { FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // NaN
203     };
204 
205     const FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
206     const FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
207 
208     FLOAT32 f32;
209     f32.value.u = RESULT[ t1 ][ t2 ];
210 
211     bool computeDenorms = ( denormRetain && ( Float32IsDenorm( arg1 ) || Float32IsDenorm( arg2 ) ) );
212 
213     if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
214     {
215         return arg1 + arg2;
216     }
217 
218     return f32.value.f;
219 }
220 
221 /*****************************************************************************\
222 Inline Function:
223     Float32SafeSubtract
224 
225 Description:
226     Performs subtraction taking care of floating point specials in software.
227 \*****************************************************************************/
Float32SafeSubtract(const float arg1,const float arg2,const bool denormRetain)228 inline float Float32SafeSubtract( const float arg1, const float arg2, const bool denormRetain )
229 {
230     FLOAT32 f32;
231     f32.value.f = arg2;
232 
233     // flip sign bit
234     f32.sign ^= 1;
235 
236     return Float32SafeAdd( arg1, f32.value.f, denormRetain );
237 }
238 
239 /*****************************************************************************\
240 Inline Function:
241     Float32SafeMultiply
242 
243 Description:
244     Performs multiplication taking care of floating point specials in software.
245 \*****************************************************************************/
Float32SafeMultiply(const float arg1,const float arg2,const bool denormRetain)246 inline float Float32SafeMultiply( const float arg1, const float arg2, const bool denormRetain )
247 {
248     // Table for handling IEEE 754 specials in multiplication
249     //
250     //  a * b       -Inf    -X      -0      +0      +X      +Inf    NaN
251     //
252     //  -Inf        +Inf    +Inf    NaN     NaN     -Inf    -Inf    NaN
253     //  -X          +Inf    <mul>   +0      -0      <mul>   -Inf    NaN
254     //  -0          NaN     +0      +0      -0      -0      NaN     NaN
255     //  +0          NaN     -0      -0      +0      +0      NaN     NaN
256     //  +X          -Inf    <mul>   -0      +0      <mul>   +Inf    NaN
257     //  +Inf        -Inf    -Inf    NaN     NaN     +Inf    +Inf    NaN
258     //  NaN         NaN     NaN     NaN     NaN     NaN     NaN     NaN
259     //
260 
261     static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
262     //    -Inf                  -X                    -denorm               -0                    +0                    +denorm               +X                    +Inf                  NaN
263         { FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN      },  // -Inf
264         { FPU_FLOAT32_POS_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN      },  // -X
265         { FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // -denorm
266         { FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // -0
267         { FPU_FLOAT32_NAN     , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // +0
268         { FPU_FLOAT32_NAN     , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // +denorm
269         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // +X
270         { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN      },  // +Inf
271         { FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // NaN
272     };
273 
274     FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
275     FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
276 
277     FLOAT32 f32;
278     f32.value.u = RESULT[ t1 ][ t2 ];
279 
280     bool computeDenorms = ( denormRetain && ( Float32IsDenorm( arg1 ) || Float32IsDenorm( arg2 ) ) );
281 
282     if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
283     {
284         return arg1 * arg2;
285     }
286 
287     return f32.value.f;
288 }
289 
290 /*****************************************************************************\
291 Inline Function:
292     Float32SafeFMA
293 
294 Description:
295     Performs fused multiply and add taking care of floating point specials in
296     software.
297 
298     This is machine generated code provided by SSG.
299 
300 \*****************************************************************************/
Float32SafeFMA(const float a,const float b,const float c)301 inline float Float32SafeFMA( const float a, const float b, const float c )
302 {
303     const DWORD _own_large_value_32[] = { 0x71800000, 0xf1800000 };
304     const DWORD _own_small_value_32[] = { 0x0d800000, 0x8d800000 };
305     const DWORD _ones[]               = { 0x3f800000, 0xbf800000 };
306 
307     DWORD ux = 0;
308     DWORD uy = 0;
309     DWORD uz = 0;
310     DWORD ur = 0;
311     DWORD xbits = 0;
312     DWORD ybits = 0;
313     DWORD zbits = 0;
314     DWORD uhi = 0;
315     DWORD ulo = 0;
316     DWORD vhi = 0;
317     DWORD vlo = 0;
318     DWORD remain = 0;
319     DWORD temp = 0;
320     DWORD L_mask = 0;
321     DWORD R_mask = 0;
322 
323     INT zsign = 0;
324     INT rsign = 0;
325     INT xexp = 0;
326     INT yexp = 0;
327     INT zexp = 0;
328     INT rexp = 0;
329     INT carry = 0;
330     INT borrow = 0;
331     INT rm = 0;
332     INT shift = 0;
333     INT L_shift = 0;
334     INT R_shift = 0;
335 
336     UINT64 ubits = 0;
337     float resultf = 0;
338     float tv = 0;
339     float x = a;
340     float y = b;
341     float z = c;
342 
343     // Set to round to nearest even.
344     rm = 0;
345 
346     ux = FLOAT32( x >= 0.0f ? x : -x ).value.u;
347     uy = FLOAT32( y >= 0.0f ? y : -y ).value.u;;
348     uz = FLOAT32( z >= 0.0f ? z : -z ).value.u;;
349 
350     int cond1 = ( ux == 0 ) |
351         ( ux >= 0x7f800000 ) |
352         ( ux == 0x3f800000 ) |
353         ( uy == 0 ) |
354         ( uy >= 0x7f800000 ) |
355         ( uy == 0x3f800000 ) |
356         ( uz == 0 ) |
357         ( uz >= 0x7f800000 );
358 
359     if( cond1 != 0 )
360     {
361         if(  Float32IsInfinity( z ) &&
362             !Float32IsInfinity( x ) &&
363             !Float32IsInfinity( y ) )
364         {
365             resultf = ( z + x ) + y;
366         }
367         else
368         {
369             resultf = x * y + z;
370         }
371 
372         return resultf;
373     }
374 
375     xexp = (int)( ux >> 23 );
376     yexp = (int)( uy >> 23 );
377     zexp = (int)( uz >> 23 );
378 
379     xbits = 0x00800000 | ( ux & 0x007fffff );
380     ybits = 0x00800000 | ( uy & 0x007fffff );
381     zbits = 0x00800000 | ( uz & 0x007fffff );
382 
383 
384     rsign = ( FLOAT32(x).value.s ^ FLOAT32(y).value.s ) & 0x80000000;
385     rexp  = ( xexp + yexp ) - 0x7F;
386     ubits = (UINT64)xbits * ybits;
387 
388     if( (DWORD) ( ubits >> 32 ) & 0x00008000 )
389     {
390         uhi = (DWORD)( ubits >> 24 );
391         ulo = ( (DWORD)ubits << 8 );
392         rexp++;
393     }
394     else
395     {
396         uhi = (DWORD)( ubits >> 23 );
397         ulo = ( (DWORD)ubits << 9 );
398     }
399 
400     int cond2 = ( rexp > zexp ) |
401                 ( ( rexp == zexp ) & ( uhi >= zbits ) );
402 
403     if( cond2 != 0 )
404     {
405         shift = ( rexp - zexp );
406         vhi = zbits;
407         vlo = 0;
408         zsign = FLOAT32(z).value.s & 0x80000000;
409     }
410     else
411     {
412         shift = ( zexp - rexp );
413         rexp = zexp;
414         vhi = uhi;
415         vlo = ulo;
416         uhi = zbits;
417         ulo = 0;
418         zsign = rsign;
419         rsign = FLOAT32(z).value.s & 0x80000000;
420     }
421 
422     remain = 0;
423     if( shift != 0 )
424     {
425         if( shift < 32 )
426         {
427             L_shift = 32 - shift;
428             R_shift = shift - 0;
429             L_mask = ~( 0xffffffffu >> R_shift );
430             remain = ( vlo << L_shift );
431             vlo = ( ( vhi << L_shift ) & L_mask) | ( vlo >> R_shift );
432             vhi = ( vhi >> R_shift );
433         }
434         else if( shift < 64 )
435         {
436             L_shift = 64 - shift;
437             R_shift = shift - 32;
438             L_mask = ~( 0xffffffffu >> R_shift );
439             remain = ( ( vhi << L_shift ) & L_mask ) | ( vlo != 0 );
440             vlo = ( vhi >> R_shift );
441             vhi = 0;
442         }
443         else
444         {
445             remain = ( vhi | vlo ) != 0;
446             vhi = vlo = 0;
447         }
448     }
449 
450     if( rsign == zsign )
451     {
452         temp = ulo;
453         ulo += vlo;
454         carry = ( ulo < temp );
455         uhi += ( vhi + carry );
456 
457         if ( uhi & 0x01000000 )
458         {
459             remain = ( uhi << 31 ) | ( ( ulo | remain ) != 0 );
460             ur = ( uhi >> 1 ) & 0x007fffff;
461             rexp += 1;
462         }
463         else
464         {
465             remain = ulo | ( remain != 0 );
466             ur = (uhi & 0x007fffff);
467         }
468     }
469     else
470     {
471         remain = ( 0 - remain );
472         borrow = ( remain != 0 );
473         temp = ulo;
474         ulo -= borrow;
475         borrow = ( ulo > temp );
476         uhi -= borrow;
477         temp = ulo;
478         ulo -= vlo;
479         borrow = ( ulo > temp );
480         uhi -= borrow;
481         uhi -= vhi;
482 
483         if( uhi != 0 )
484         {
485             temp = ( uhi << 8 );
486             shift = 0;
487         }
488         else if( ulo != 0 )
489         {
490             temp = ulo;
491             shift = 24;
492         }
493         else if( remain != 0 )
494         {
495             temp = remain;
496             shift = 24 + 32;
497         }
498         else
499         {
500             return FLOAT32( (DWORD)0x00000000 ).value.f;
501         }
502 
503         shift += clz( temp );
504 
505         if( shift < 32 )
506         {
507             L_shift = shift - 0;
508             R_shift = 32 - shift;
509             R_mask = ( (DWORD) 1 << L_shift ) - 1;
510             ur = ( ( uhi << L_shift ) | (( ulo >> R_shift ) & R_mask ) ) & 0x007fffff;
511             remain = ( ulo << L_shift ) | ( remain != 0 );
512         }
513         else if( shift < 64 )
514         {
515             L_shift = shift - 32;
516             R_shift = 64 - shift;
517             R_mask = ( (DWORD) 1 << L_shift ) - 1;
518             ur = ( ( ulo << L_shift ) | ( ( remain >> R_shift ) & R_mask ) ) & 0x007fffff;
519             remain = ( remain << L_shift );
520         }
521         else
522         {
523             L_shift = shift - 64;
524             ur = ( remain << L_shift ) & 0x007fffff;
525             remain = 0;
526         }
527         rexp -= shift;
528     }
529 
530     if( (DWORD) rexp - 1 >= 0xFF - 1 )
531     {
532         if( rexp >= 0xFF )
533         {
534             rsign = ( (DWORD)rsign >> 31 );
535             if( rsign )
536             {
537                 resultf = tv = FLOAT32(_own_large_value_32[(1)]).value.f * FLOAT32(_own_large_value_32[0]).value.f;
538             }
539             else
540             {
541                 resultf = tv = FLOAT32(_own_large_value_32[(0)]).value.f * FLOAT32(_own_large_value_32[0]).value.f;
542             }
543 
544             return resultf;
545         }
546         else
547         {
548             //enters here only for rexp = 0
549             L_shift = 31;
550             R_shift = 1;
551             L_mask = ~(0xffffffffu >>  R_shift );
552             ur |= 0x00800000;
553             remain = ( ( ur << L_shift ) & L_mask ) | ( remain != 0 );
554             ur = ( ur >> R_shift );
555 
556         }
557     }
558     else
559     {
560         ur |= ( rexp << 23 );
561     }
562 
563     if( remain != 0 )
564     {
565         tv = ( ( (float *)_ones)[0] + ( (float *)_own_small_value_32)[0] );
566 
567         int cond3, cond4, cond5, cond6;
568 
569         switch( rm )
570         {
571         case ( 0 << 10 ):
572             cond3 = ( ( remain & 0x80000000 ) != 0 ) & ( ( ( ur & 1 ) != 0 ) |
573                     ( ( remain & ~0x80000000 ) != 0 ) );
574             if( cond3 != 0 )
575             {
576                 ur++;
577                 if( ur >= 0x7f800000 )
578                 {
579                     rsign = ( (unsigned)rsign >> 31 );
580                     if( rsign )
581                     {
582                         resultf = tv =
583                             ( ( (float *) _own_large_value_32)[1] *
584                             ( (float *) _own_large_value_32)[0] );
585                     }
586                     else
587                     {
588                         resultf = tv =
589                             (((float *) _own_large_value_32)[(0)] *
590                             ((float *) _own_large_value_32)[0]);
591                     }
592 
593                     return resultf;
594                 }
595             }
596 
597         case ( 3 << 10 ):
598             cond4 = ( ur < 0x00800000 ) |
599                     ( (ur == 0x00800000 ) & ( remain == 0x80000000 ) );
600 
601             if( cond4 != 0 )
602             {
603                 tv = ( ( ( float *)_own_small_value_32)[0] *
604                      ( ( float *)_own_small_value_32)[0] );
605             }
606             break;
607 
608         case ( 2 << 10 ):
609             cond5 = ( rsign & ( ur < 0x00800000 ) ) |
610                     ( (!rsign) & ( (ur < 0x007fffff ) | ( ( ur == 0x007fffff ) & ( remain < 0x80000000 ) ) ) );
611 
612             if( cond5 != 0 )
613             {
614                 tv = ( ( (float *)_own_small_value_32)[0] *
615                        ( (float *)_own_small_value_32)[0] );
616             }
617 
618             if( !rsign )
619             {
620                 ur++;
621                 if( ur >= 0x7f800000 )
622                 {
623                     //rsign = ((unsigned) rsign >> 31);
624                     resultf = tv = ( ( (float *)_own_large_value_32)[0] *
625                                      ( (float *)_own_large_value_32)[0] );
626                     return resultf;
627                 }
628             }
629             break;
630 
631         case ( 1 << 10 ):
632             cond6 = ( !rsign & ( ur < 0x00800000 ) ) |
633                     ( rsign & ( (ur < 0x007fffff ) | ( ( ur == 0x007fffff ) & ( remain < 0x80000000 ) ) ) );
634 
635             if( cond6 != 0 )
636             {
637                 tv = ( ( (float *)_own_small_value_32)[0] *
638                        ( (float *)_own_small_value_32)[0] );
639             }
640 
641             if( rsign )
642             {
643                 ur++;
644                 if (ur >= 0x7f800000 )
645                 {
646                     //rsign = ((unsigned) rsign >> 31);
647                     resultf = tv =
648                         ( ( (float *)_own_large_value_32)[1] *
649                           ( (float *)_own_large_value_32)[0] );
650 
651                     return resultf;
652                 }
653             }
654             break;
655         }
656     }
657 
658     resultf = FLOAT32( (DWORD) (rsign | ur ) ).value.f;
659 
660     return resultf;
661 }
662 
663 /*****************************************************************************\
664 Inline Function:
665     Float32SafeRSQRT
666 
667 Description:
668     Performs correctly rounded single precision reciprocal square root
669     operation taking care of floating point specials in software.
670 \*****************************************************************************/
Float32SafeRSQRT(const float arg,bool denormRetain)671 inline float Float32SafeRSQRT( const float arg, bool denormRetain )
672 {
673     static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES] =
674     {
675         FPU_FLOAT32_NAN,       // rsqrt( -inf )    = NaN
676         FPU_FLOAT32_NAN,       // rsqrt( -X )      = NaN  //but to be really OK,we should try to maintain the NaN payload
677         FPU_FLOAT32_NAN,       // rsqrt( -denorm ) = NaN  //but to be really OK,we should try to maintain the NaN payload
678         FPU_FLOAT32_NEG_INF,   // rsqrt( -0 )      = -inf
679         FPU_FLOAT32_POS_INF,   // rsqrt( +0 )      = +inf
680         FPU_FLOAT32_COMPUTE,   // rsqrt( +denorm)  = computed value
681         FPU_FLOAT32_COMPUTE,   // rsqrt( +X )      == computed value
682         FPU_FLOAT32_POS_ZERO,  // rsqrt( +inf )    == +0.0
683         FPU_FLOAT32_NAN        // rsqrt( NaN )     == NaN
684     };
685 
686     FPU_FLOAT_CLASS t1 = Float32GetClass( arg );
687 
688     FLOAT32 f32;
689     f32.value.u = RESULT[ t1 ];
690 
691     bool computeDenorms = denormRetain &&  Float32IsDenorm( arg );
692 
693     if ( !computeDenorms && t1 == FPU_FLOAT_CLASS_NEG_DENORM )
694     {
695         f32.value.u = FPU_FLOAT32_NEG_INF;
696     }
697     if ( !computeDenorms && t1 == FPU_FLOAT_CLASS_POS_DENORM )
698     {
699         f32.value.u = FPU_FLOAT32_POS_INF;
700     }
701 
702     if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
703     {
704         double darg = arg;
705         double s = sqrt(darg);      //double-precision square root
706         double result = 1.0 / s;    //double-precision division
707         return static_cast<float>(result);     //back to floats
708     }
709 
710     return f32.value.f;
711 }
712 
713 /*****************************************************************************\
714 Inline Function:
715     Float32SafeDivide
716 
717 Description:
718     Performs division taking care of floating point specials in software.
719 \*****************************************************************************/
Float32SafeDivide(const float arg1,const float arg2,const bool denormRetain)720 inline float Float32SafeDivide( const float arg1, const float arg2, const bool denormRetain )
721 {
722     // Table for handling IEEE 754 specials in division
723     //
724     //  a / b       -Inf    -X      -0      +0      +X      +Inf    NaN
725     //
726     //  -Inf        NaN     +Inf    +Inf    -Inf    -Inf    NaN     NaN
727     //  -X          +0      <div>   +Inf    -Inf    <div>   -0      NaN
728     //  -0          +0      +0      NaN     NaN     -0      -0      NaN
729     //  +0          -0      -0      NaN     NaN     +0      +0      NaN
730     //  +X          -0      <div>   -Inf    +Inf    <div>   +0      NaN
731     //  +Inf        NaN     -Inf    -Inf    +Inf    +Inf    NaN     NaN
732     //  NaN         NaN     NaN     NaN     NaN     NaN     NaN     NaN
733     //
734 
735     static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
736     //    -Inf                  -X                    -denorm               -0                    +0                    +denorm               +X                    +Inf                  NaN
737         { FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // -Inf
738         { FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN      },  // -X
739         { FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN      },  // -denorm
740         { FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN      },  // -0
741         { FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN      },  // +0
742         { FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN      },  // +denorm
743         { FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN      },  // +X
744         { FPU_FLOAT32_NAN     , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // +Inf
745         { FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN     , FPU_FLOAT32_NAN      },  // NaN
746     };
747 
748     FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
749     FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
750 
751     FLOAT32 f32;
752     f32.value.u = RESULT[ t1 ][ t2 ];
753 
754     bool computeDenorms = ( denormRetain && ( Float32IsDenorm( arg1 ) || Float32IsDenorm( arg2 ) ) );
755 
756     if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
757     {
758         return arg1 / arg2;
759     }
760 
761     return f32.value.f;
762 }
763 
/*****************************************************************************\
Inline Function:
    Signed32SafeDivideQuotient

Description:
    Computes the quotient of src0 divided by src1 without ever executing a
    division whose result is undefined in C++.

    Table for handling signed divide quotient and remainder:
        IDIV            SRC0
            SRC1        +INT            -INT             0
            +INT        +INT            -INT             0
            -INT        -INT            +INT             0
              0     Q:0x7FFFFFFF    Q: 0x80000000   Q:0x7FFFFFFF
                    R:0x7FFFFFFF    R: 0x80000000   R:0x7FFFFFFF

    The divide-by-zero results are the 32-bit values of the table on every
    platform, also where "signed long" is wider than 32 bits.

    LONG_MIN / -1 is not representable in a signed long; for that pair the
    two's-complement wrapped result (LONG_MIN) is returned instead of
    performing the division.

Input:
    src0 - dividend
    src1 - divisor

Output:
    quotient, or the table value when src1 is zero

\*****************************************************************************/
inline signed long Signed32SafeDivideQuotient(
    const signed long src0,
    const signed long src1 )
{
    // 32-bit sentinels from the table above. They are spelled out explicitly
    // because LONG_MAX / LONG_MIN are 64-bit values on LP64 targets.
    const signed long divByZeroNonNegative = 0x7FFFFFFFL;
    const signed long divByZeroNegative    = -0x7FFFFFFFL - 1L;

    if( src1 == 0 )
    {
        return ( src0 < 0 ) ? divByZeroNegative : divByZeroNonNegative;
    }

    // The only other quotient that overflows; dividing here is undefined
    // behaviour and raises a hardware exception on x86.
    if( ( src1 == -1 ) && ( src0 == LONG_MIN ) )
    {
        return LONG_MIN;
    }

    return src0 / src1;
}
793 
/*****************************************************************************\
Inline Function:
    Signed32SafeDivideRemainder

Description:
    Computes the remainder of src0 divided by src1 without ever executing a
    modulo operation whose result is undefined in C++.

    Division by zero yields 0x80000000 when src0 is negative and 0x7FFFFFFF
    otherwise (including src0 == 0). These are 32-bit values on every
    platform, also where "signed long" is wider than 32 bits.

    LONG_MIN % -1 is undefined because the matching quotient overflows; the
    mathematically correct remainder 0 is returned for that pair.

Input:
    src0 - dividend
    src1 - divisor

Output:
    remainder, or the sentinel described above when src1 is zero

\*****************************************************************************/
inline signed long Signed32SafeDivideRemainder(
    const signed long src0,
    const signed long src1 )
{
    // Explicit 32-bit sentinels: LONG_MAX / LONG_MIN are 64-bit on LP64.
    const signed long divByZeroNonNegative = 0x7FFFFFFFL;
    const signed long divByZeroNegative    = -0x7FFFFFFFL - 1L;

    if( src1 == 0 )
    {
        return ( src0 < 0 ) ? divByZeroNegative : divByZeroNonNegative;
    }

    // x % -1 is always 0; answering directly avoids the undefined
    // LONG_MIN % -1, which raises a hardware exception on x86.
    if( src1 == -1 )
    {
        return 0;
    }

    return src0 % src1;
}
816 
817 /*****************************************************************************\
818 Inline Function:
819     Unsigned32SafeDivideQuotient
820 
821 Description:
822     Computes src0 divided by src1
823        Table for handling unsigned divide quotient and remainder
824           UDIV          SRC0
825               SRC1      <>0             0
826               <>0       UINT            0
827                 0   Q:0xFFFFFFFF    Q:0xFFFFFFFF
828                     R:0xFFFFFFFF    R:0xFFFFFFFF
829 \*****************************************************************************/
Unsigned32SafeDivideQuotient(const DWORD src0,const DWORD src1)830 inline DWORD Unsigned32SafeDivideQuotient(
831     const DWORD src0,
832     const DWORD src1 )
833 {
834     if( !src1 )
835     {
836         return UINT_MAX;
837     }
838 
839     return src0 / src1;
840 }
841 
842 /*****************************************************************************\
843 Inline Function:
844     Unsigned32SafeDivideRemainder
845 
846 Description:
847     Computes remainder of src0 divided by src1
848 \*****************************************************************************/
Unsigned32SafeDivideRemainder(const DWORD src0,const DWORD src1)849 inline DWORD Unsigned32SafeDivideRemainder(
850     const DWORD src0,
851     const DWORD src1 )
852 {
853     if( !src1 )
854     {
855         return UINT_MAX;
856     }
857 
858     return src0 % src1;
859 }
860 
861 /*****************************************************************************\
862 Inline Function:
863     F32ToF16_d
864 
865 Description:
866     Float32 to float16 conversion based on "Fast Half Float Conversions"
867     by Jeroen van der Zijp
868 
869 Input:
870     32-bit DWORD represantation of float value
871 Output:
872     16-bit DWORD represantation of float value
873 
874 \*****************************************************************************/
F32ToF16_d(DWORD arg)875 inline WORD F32ToF16_d( DWORD arg )
876 {
877     static const WORD btbl[512] = {
878         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
879         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
880         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
881         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
882         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
883         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
884         0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
885         0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
886         0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7c00,
887         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
888         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
889         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
890         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
891         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
892         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
893         0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
894         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
895         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
896         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
897         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
898         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
899         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
900         0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
901         0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
902         0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfc00,
903         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
904         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
905         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
906         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
907         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
908         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
909         0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00
910     };
911     static const unsigned char stbl[512] = {
912         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
913         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
914         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
915         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
916         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
917         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
918         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
919         0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
920         0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
921         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
922         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
923         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
924         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
925         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
926         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
927         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x0d,
928         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
929         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
930         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
931         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
932         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
933         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
934         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
935         0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
936         0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
937         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
938         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
939         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
940         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
941         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
942         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
943         0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x0d
944     };
945     DWORD sexp = (arg>>23)&0x1ff;
946     return (WORD)(btbl[ sexp ]+( (arg&0x007fffff)>>stbl[ sexp ] ));
947 }
948 
949 /*****************************************************************************\
950 
951 Inline Function:
952     F32ToF16_f
953 
954 Description:
955     Float32 to float16 conversion based on "Fast Half Float Conversions"
956     by Jeroen van der Zijp
957 
958 Input:
959     32-bit float value
960 Output:
961     16-bit WORD represantation of float value
962 
963 \*****************************************************************************/
F32ToF16_f(float arg)964 inline WORD F32ToF16_f( float arg )
965 {
966     return F32ToF16_d( *(DWORD *)&arg );
967 }
968 
969 /*****************************************************************************\
970 
971 Inline Function:
972     F16ToF32
973 
974 Description:
975     Float16 to float32 conversion
976 
977 Input:
978     16-bit WORD representation of float16 value
979 Output:
980     32-bit DWORD represantation of float32 value
981 
982 \*****************************************************************************/
F16ToF32(WORD v)983 static inline DWORD F16ToF32( WORD v )
984 {
985     unsigned long index;
986     return
987         // is exponent!=0 ?
988         v & 0x7C00
989             // is exponent==max ?
990             ? ( v & 0x7C00 ) == 0x7C00
991                 // is mantissa!=0 ?
992                 ? v & 0x03FF
993                     // convert NaN
994                     ? ( ( v << 13 ) + 0x70000000 ) | 0x7f800000
995                     // convert infinities
996                     : ( v << 16 ) | 0x7f800000
997                 // convert normalized values
998                 : ( ( ( v << 13 ) + 0x70000000 ) & ~0x70000000 ) + 0x38000000
999             // is mantissa non-zero ?
1000             : v & 0x03FF
1001                 // convert denormalized values
1002                 ? index=bsr( v & 0x03FF ), ( ( ( ( v << 16 ) & 0x80000000 ) | ( ( v << 13 ) & 0xF800000 ) ) + 0x33800000 + ( index << 23 ) ) | ( ( ( v & 0x03FF ) << ( 23-index ) ) & ~0x800000 )
1003                 // convert zeros
1004                 : v << 16;
1005 }
1006 
1007 /*****************************************************************************\
1008 Inline Function:
1009     Float32SafeMax
1010 
1011 Description:
1012     MinMax of Floating Point Numbers.
1013 
1014 Input:
1015     arg1
1016     arg2
1017     isGen7
1018 
1019 Output:
1020     max( arg1, arg2 )
1021 
1022 \*****************************************************************************/
Float32SafeMax(const float arg1,const float arg2,bool isGen7)1023 inline float Float32SafeMax( const float arg1, const float arg2, bool isGen7 )
1024 {
1025     // Values of following arrays corresponds to results of sel.l instructions.
1026 
1027     static const bool RESULT_preGen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1028     //    -Inf   -X      -denorm   -0      +0   +denorm   +X      +Inf    NaN
1029         { true  , false , false , false , false , false , false , false , true      },  // -Inf
1030         { true  , false , false , false , false , false , false , false , true      },  // -X
1031         { true  , true  , true  , true  , true  , true  , false , false , true      },  // -denorm
1032         { true  , true  , true  , true  , true  , true  , false , false , true      },  // -0
1033         { true  , true  , true  , true  , true  , true  , false , false , true      },  // +0
1034         { true  , true  , true  , true  , true  , true  , false , false , true      },  // +denorm
1035         { true  , true  , true  , true  , true  , true  , false , false , true      },  // +X
1036         { true  , true  , true  , true  , true  , true  , true  , true  , true      },  // +Inf
1037         { false , false , false , false , false , false , false , false , false     },  // NaN
1038     };
1039 
1040     static const bool RESULT_Gen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1041     //    -Inf   -X      -denorm   -0      +0   +denorm   +X      +Inf    NaN
1042         { true  , false , false , false , false , false , false , false , true      },  // -Inf
1043         { true  , false , false , false , false , false , false , false , true      },  // -X
1044         { true  , true  , true  , true  , true  , true  , false , false , true      },  // -denorm
1045         { true  , true  , true  , true  , false , true  , false , false , true      },  // -0
1046         { true  , true  , true  , true  , true  , true  , false , false , true      },  // +0
1047         { true  , true  , true  , true  , true  , true  , false , false , true      },  // +denorm
1048         { true  , true  , true  , true  , true  , true  , false , false , true      },  // +X
1049         { true  , true  , true  , true  , true  , true  , true  , true  , true      },  // +Inf
1050         { false , false , false , false , false , false , false , false , false     },  // NaN
1051     };
1052 
1053     const FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
1054     const FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
1055 
1056     if( ( t1 == FPU_FLOAT_CLASS_NEG_FINITE || t1 == FPU_FLOAT_CLASS_POS_FINITE ) &&
1057         ( t2 == FPU_FLOAT_CLASS_NEG_FINITE || t2 == FPU_FLOAT_CLASS_POS_FINITE ) )
1058     {
1059         return ( arg1 >= arg2 ) ? arg1 : arg2;
1060     }
1061 
1062     FLOAT32 f32;
1063 
1064     if( isGen7 )
1065     {
1066         f32.value.f = ( RESULT_Gen7[t1][t2] ) ? arg1 : arg2;
1067     }
1068     else
1069     {
1070         f32.value.f = ( RESULT_preGen7[t1][t2] ) ? arg1 : arg2;
1071     }
1072 
1073     return f32.value.f;
1074 }
1075 
1076 /*****************************************************************************\
1077 Inline Function:
1078     Float32SafeMin
1079 
1080 Description:
1081     MinMax of Floating Point Numbers.
1082 
1083 Input:
1084     arg1
1085     arg2
1086     isGen7
1087 
1088 Output:
1089     max( arg1, arg2 )
1090 
1091 \*****************************************************************************/
Float32SafeMin(const float arg1,const float arg2,bool isGen7)1092 inline float Float32SafeMin( const float arg1, const float arg2, bool isGen7 )
1093 {
1094     // Values of following arrays corresponds to results of sel.ge instruction.
1095 
1096     static const bool RESULT_preGen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1097     //    -Inf   -X      -denorm   -0      +0   +denorm   +X      +Inf    NaN
1098         { false , true  , true  , true  , true  , true  , true  , true  , true      },  // -Inf
1099         { false , false , true  , true  , true  , true  , true  , true  , true      },  // -X
1100         { false , false , false , false , false , false , true  , true  , true      },  // -denorm
1101         { false , false , false , false , false , false , true  , true  , true      },  // -0
1102         { false , false , false , false , false , false , true  , true  , true      },  // +0
1103         { false , false , false , false , false , false , true  , true  , true      },  // +denorm
1104         { false , false , false , false , false , false , false , true  , true      },  // +X
1105         { false , false , false , false , false , false , false , false , true      },  // +Inf
1106         { false , false , false , false , false , false , false , false , false     },  // NaN
1107     };
1108 
1109     static const bool RESULT_Gen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1110     //    -Inf   -X      -denorm   -0      +0   +denorm   +X      +Inf    NaN
1111         { false , true  , true  , true  , true  , true  , true  , true  , true      },  // -Inf
1112         { false , false , true  , true  , true  , true  , true  , true  , true      },  // -X
1113         { false , false , false , false , false , false , true  , true  , true      },  // -denorm
1114         { false , false , false , false , true  , false , true  , true  , true      },  // -0
1115         { false , false , false , false , false , false , true  , true  , true      },  // +0
1116         { false , false , false , false , false , false , true  , true  , true      },  // +denorm
1117         { false , false , false , false , false , false , false , true  , true      },  // +X
1118         { false , false , false , false , false , false , false , false , true      },  // +Inf
1119         { false , false , false , false , false , false , false , false , false     },  // NaN
1120     };
1121 
1122     const FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
1123     const FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
1124 
1125     if( ( t1 == FPU_FLOAT_CLASS_NEG_FINITE || t1 == FPU_FLOAT_CLASS_POS_FINITE ) &&
1126         ( t2 == FPU_FLOAT_CLASS_NEG_FINITE || t2 == FPU_FLOAT_CLASS_POS_FINITE ) )
1127     {
1128         return ( arg1 < arg2 ) ? arg1 : arg2;
1129     }
1130 
1131     FLOAT32 f32;
1132 
1133     if( isGen7 )
1134     {
1135         f32.value.f = ( RESULT_Gen7[t1][t2] ) ? arg1 : arg2;
1136     }
1137     else
1138     {
1139         f32.value.f = ( RESULT_preGen7[t1][t2] ) ? arg1 : arg2;
1140     }
1141 
1142     return f32.value.f;
1143 }
1144 
1145 /*****************************************************************************\
1146 Inline Function:
1147     FloatSaturate
1148 
1149 Description:
1150 
1151     For a floating-point destination type, the saturation target range is [0.0,
1152     1.0]. For a floating-point NaN, there is no "closest value"; any NaN
1153     saturates to 0.0. (...) Any floating-point number greater than 1.0,
1154     including +INF, saturates to 1.0. Any negative floating-point number,
1155     including -INF, saturates to 0.0. Any floating-point number in the range 0.0
1156     to 1.0 is not changed by saturation.
1157 
1158     -0.0 is changed to +0.0.
1159 
1160 Input:
1161     const float f
1162 
1163 Output:
1164     float
1165 
1166 \*****************************************************************************/
FloatSaturate(const float f)1167 inline float FloatSaturate( const float f )
1168 {
1169     switch( Float32GetClass( f ) )
1170     {
1171     case FPU_FLOAT_CLASS_NEG_INF:
1172     case FPU_FLOAT_CLASS_NEG_FINITE:
1173     case FPU_FLOAT_CLASS_NEG_DENORM:
1174     case FPU_FLOAT_CLASS_NEG_ZERO:
1175     case FPU_FLOAT_CLASS_POS_ZERO:
1176     case FPU_FLOAT_CLASS_NAN:
1177         return 0.f;
1178     case FPU_FLOAT_CLASS_POS_DENORM:
1179         return f;
1180     case FPU_FLOAT_CLASS_POS_FINITE:
1181         return ( f <= 1.f ) ? f : 1.f;
1182     case FPU_FLOAT_CLASS_POS_INF:
1183         return 1.f;
1184     default:
1185         ASSERT( 0 );
1186         return 0.f;
1187     }
1188 }
1189 
1190 } // namespace iSTD
1191